From bd44e6c1d0406aca9812569574a0bfc99e62de2e Mon Sep 17 00:00:00 2001
From: rdjjke
Date: Thu, 14 Nov 2024 14:00:07 +0000
Subject: [PATCH 1/5] MSP-3313: Add NSYS profiling in GPT3 mlperf implementation

---
 .../mlperf/gpt3-impl-4.0-nvidia/Dockerfile | 15 +++++++++++
 soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION | 2 +-
 soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub | 1 +
 .../mlperf/gpt3-impl-4.0-nvidia/start.sh | 25 ++++++++++++++++---
 4 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile b/soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile
index 9d1f22ab..11377040 100644
--- a/soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile
+++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile
@@ -121,6 +121,21 @@ RUN rm -rf /opt/hpcx/nccl_rdma_sharp_plugin && \
 RUN pip install huggingface_hub==0.23.2
 RUN pip install -v "transformers<=4.40.2"
 
+## Reinstall NCCL to the latest version
+#RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
+#RUN dpkg -i cuda-keyring_1.1-1_all.deb
+#RUN apt-get update
+#RUN apt install libnccl2=2.23.4-1+cuda12.4 libnccl-dev=2.23.4-1+cuda12.4
+
+## Install NCCL profiler plugin
+#RUN git clone https://github.com/NVIDIA/nccl && \
+#    cd nccl && \
+#    git checkout v2.23.4-1 && \
+#    cd ext-profiler/example && \
+#    make && \
+#    cp libnccl-profiler.so /usr/lib/x86_64-linux-gnu/
+
+
 # Benchmark code
 WORKDIR /workspace/llm
 
diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION b/soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION
index 9c38d380..e2c2ff71 100644
--- a/soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION
+++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION
@@ -1 +1 @@
-4.0-16
+4.0-20
diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub b/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub
index b577694f..43b59658 100755
--- a/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub
+++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub
@@ -235,6 +235,7 @@ cleanup_preload_shared() {
 if [ -n "${CONTAINER_PRELOAD_SHARED_PATH}" ]; then
     CONT_FILE="${CONTAINER_PRELOAD_SHARED_PATH}/containers/${SLURM_JOBID}_$(basename ${CONT}).squashfs"
     # Prepull container image to the shared filesystem
+    mkdir -p "${CONTAINER_PRELOAD_SHARED_PATH}/containers"
     srun --ntasks=1 enroot import --output ${CONT_FILE} docker://${CONT}
 else
     CONT_FILE=${CONT}
diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh
index 7b66f445..acd5bafc 100755
--- a/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh
+++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/start.sh
@@ -6,7 +6,7 @@ usage() {
   echo "usage: ${0} -N [-w ] [-c ]" >&2
   echo " [-e ]" >&2
   echo " [-i ] [-D ] [-C ] [-R ] [-S ]" >&2
-  echo " [-q (quick_start)] [-r (remove_prev_logs)] [-d (debug)] [-h (help)]" >&2
+  echo " [-q (quick_start)] [-r (remove_prev_logs)] [-d (debug)] [-p (nsys_profiling)] [-h (help)]" >&2
   exit 1
 }
 
@@ -16,7 +16,7 @@ dataset_dir="/mlperf-data/gpt3-dataset-4.0"
 checkpoint_dir="/mlperf-data/gpt3-checkpoint-4.0"
 result_dir="./result"
 
-while getopts N:w:c:e:i:D:C:R:S:qrdh flag
+while getopts N:w:c:e:i:D:C:R:S:qrdph flag
 do
   case "${flag}" in
     N) nodes=${OPTARG};;
@@ -31,6 +31,7 @@ do
     q) quick_start=1;;
     r) rmlogs=1;;
     d) debug=1;;
+    p) nsys_profiling=1;;
     h) usage;;
     *) usage;;
   esac
@@ -116,6 +117,18 @@ if [[ $debug -eq 1 ]]; then
   export GDRCOPY_LOG_LEVEL=1
 fi
 
+if [[ $nsys_profiling -eq 1 ]]; then
+  # Configure NSYS profiler
+  export NVTX_FLAG=1
+  export PROFILE=True
+  export PROFILE_START_STEP=10
+  export PROFILE_END_STEP=11
+  export PROFILE_RANKS="0,1,2,3,4,5,6,7"
+
+  # Early stopping:
+  export TARGET_LOG_PPL=2.75
+fi
+
 if [ -z "${experiment}" ]; then
   job_name="gpt3"
   job_output="gpt3-%j.out"
@@ -124,14 +137,18 @@ else
   job_output="gpt3-%j-${experiment}.out"
 fi
 
+node_allocation="--nodes=${nodes}"
+if [ -n "${nodelist}" ]; then
+  node_allocation="--nodelist='${nodelist}'"
+fi
+
 echo "Submit Slurm job"
 sbatch \
   -t $WALLTIME \
   -J "${job_name}" \
   --output="${job_output}" \
   --export=ALL \
-  --nodes="${nodes}" \
-  --nodelist="${nodelist}" \
+  ${node_allocation} \
   --ntasks-per-node="${SBATCH_GPUS_PER_NODE}" \
   ${EXCLUSIVE:+--exclusive} \
   run.sub

From 197a5f5aa6f305b9579bbfdad06297bd173064c3 Mon Sep 17 00:00:00 2001
From: rdjjke
Date: Sun, 17 Nov 2024 17:27:07 +0000
Subject: [PATCH 2/5] Add configs for H200 nodes to GPT3 impl

---
 ...ODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh | 86 +++++++++++++++++++
 .../config_H200x8_NODEx64_default.sh | 1 +
 ...NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh | 86 +++++++++++++++++++
 ...ODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh | 86 +++++++++++++++++++
 .../config_H200x8_NODEx8_default.sh | 1 +
 5 files changed, 260 insertions(+)
 create mode 100644 soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh
 create mode 120000 soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_default.sh
 create mode 100644 soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh
 create mode 100644 soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh
 create mode 120000 soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_default.sh

diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh
new file mode 100644
index 00000000..fb48cf75
--- /dev/null
+++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# DL params
+export DGXNNODES="${DGXNNODES:=64}" # NODEx64
+export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=2}" # TPx2 (training.model.tensor_model_parallel_size)
+export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size)
+export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}" # VPx4
+export MINIBS="${MINIBS:=128}" # MINBSx128
+export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=2}" # MICBSx2
+
+# Check DL params
+# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0
+# This simplifies to MINIBS % PP == 0
+if [[ $(($MINIBS % PIPELINE_MODEL_PARALLEL)) != 0 ]]; then
+  echo "MINIBS should be divisible by PP"
+  exit 1
+fi
+
+
+
+# Slurm resource allocation
+export SBATCH_GPUS_PER_NODE="8"
+export SBATCH_MEM_PER_NODE="1200G"
+export SBATCH_TRES_PER_TASK="cpu=16"
+export SBATCH_DISTRIBUTION="block:block:block"
+export SLURM_CPU_BIND="verbose,none"
+#export EXCLUSIVE=1
+
+# Use bindpcie CPU pinning
+export ENABLE_CPU_EXCLUSIVE=1
+export ENABLE_IB_BINDING=1
+
+
+
+
+# Job time limit
+export WALLTIME_MINUTES=1200
+export WALLTIME=$(( (${NEXP:-1} * WALLTIME_MINUTES) ))
+
+
+
+# Use the userbuffer backend to overlap tensor-parallel communication with compute (training.model.ub_tp_comm_overlap).
+export TP_COMM_OVERLAP=True
+
+# Execute nvidia-smi boost-slider --vboost
+export VBOOST_VALUE=1
+
+# Set MaxQ and MinEDP clocks
+export SET_MAXQ_CLK=0
+export MAXQ_CLK=""
+export SET_MINEDP_CLK=0
+export MINEDP_CLK=""
+
+# Set power limit
+export SET_POWER_CAP=0
+export POWER_CAP=""
+
+# Use CPU offloading (activations & weights).
+export CPU_OFFLOADING=False
+
+# Load the minimal number of samples
+export LOAD_MINIMAL_NUM_SAMPLES=0
+
+# Load distributed checkpoint directly on GPU
+export LOAD_DIRECTLY_ON_DEVICE=0
+
+
+
+# Extract system name
+export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
+
+
+
+# Configure mlperf SYSJSON logging
+export MLPERF_SUBMITTER="Nebius"
+export MLPERF_SYSTEM_NAME="${DGXSYSTEM}"
+export MLPERF_STATUS="cloud"
+
+
+
+# Apply common settings
+source $(dirname ${BASH_SOURCE[0]})/config_common.sh
+
+# Apply FP8 settings
+source $(dirname ${BASH_SOURCE[0]})/config_fp8.sh
+
diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_default.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_default.sh
new file mode 120000
index 00000000..bbe6159f
--- /dev/null
+++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_default.sh
@@ -0,0 +1 @@
+config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh
\ No newline at end of file
diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh
new file mode 100644
index 00000000..b006bdd6
--- /dev/null
+++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# DL params
+export DGXNNODES="${DGXNNODES:=8}" # NODEx8
+export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=4}" # TPx4 (training.model.tensor_model_parallel_size)
+export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size)
+export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}" # VPx4
+export MINIBS="${MINIBS:=128}" # MINBSx128
+export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=2}" # MICBSx2
+
+# Check DL params
+# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0
+# This simplifies to MINIBS % PP == 0
+if [[ $(($MINIBS % PIPELINE_MODEL_PARALLEL)) != 0 ]]; then
+  echo "MINIBS should be divisible by PP"
+  exit 1
+fi
+
+
+
+# Slurm resource allocation
+export SBATCH_GPUS_PER_NODE="8"
+export SBATCH_MEM_PER_NODE="1200G"
+export SBATCH_TRES_PER_TASK="cpu=16"
+export SBATCH_DISTRIBUTION="block:block:block"
+export SLURM_CPU_BIND="verbose,none"
+#export EXCLUSIVE=1
+
+# Use bindpcie CPU pinning
+export ENABLE_CPU_EXCLUSIVE=1
+export ENABLE_IB_BINDING=1
+
+
+
+
+# Job time limit
+export WALLTIME_MINUTES=1200
+export WALLTIME=$(( (${NEXP:-1} * WALLTIME_MINUTES) ))
+
+
+
+# Use the userbuffer backend to overlap tensor-parallel communication with compute (training.model.ub_tp_comm_overlap).
+export TP_COMM_OVERLAP=True
+
+# Execute nvidia-smi boost-slider --vboost
+export VBOOST_VALUE=1
+
+# Set MaxQ and MinEDP clocks
+export SET_MAXQ_CLK=0
+export MAXQ_CLK=""
+export SET_MINEDP_CLK=0
+export MINEDP_CLK=""
+
+# Set power limit
+export SET_POWER_CAP=0
+export POWER_CAP=""
+
+# Use CPU offloading (activations & weights).
+export CPU_OFFLOADING=False
+
+# Load the minimal number of samples
+export LOAD_MINIMAL_NUM_SAMPLES=0
+
+# Load distributed checkpoint directly on GPU
+export LOAD_DIRECTLY_ON_DEVICE=0
+
+
+
+# Extract system name
+export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
+
+
+
+# Configure mlperf SYSJSON logging
+export MLPERF_SUBMITTER="Nebius"
+export MLPERF_SYSTEM_NAME="${DGXSYSTEM}"
+export MLPERF_STATUS="cloud"
+
+
+
+# Apply common settings
+source $(dirname ${BASH_SOURCE[0]})/config_common.sh
+
+# Apply FP8 settings
+source $(dirname ${BASH_SOURCE[0]})/config_fp8.sh
+
diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh
new file mode 100644
index 00000000..c8109ba5
--- /dev/null
+++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh
@@ -0,0 +1,86 @@
+#!/bin/bash
+
+# DL params
+export DGXNNODES="${DGXNNODES:=8}" # NODEx8
+export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=8}" # TPx8 (training.model.tensor_model_parallel_size)
+export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size)
+export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}" # VPx4
+export MINIBS="${MINIBS:=3072}" # MINBSx3072
+export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=1}" # MICBSx1
+
+# Check DL params
+# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0
+# This simplifies to MINIBS % PP == 0
+if [[ $(($MINIBS % PIPELINE_MODEL_PARALLEL)) != 0 ]]; then
+  echo "MINIBS should be divisible by PP"
+  exit 1
+fi
+
+
+
+# Slurm resource allocation
+export SBATCH_GPUS_PER_NODE="8"
+export SBATCH_MEM_PER_NODE="1200G"
+export SBATCH_TRES_PER_TASK="cpu=16"
+export SBATCH_DISTRIBUTION="block:block:block"
+export SLURM_CPU_BIND="verbose,none"
+#export EXCLUSIVE=1
+
+# Use bindpcie CPU pinning
+export ENABLE_CPU_EXCLUSIVE=1
+export ENABLE_IB_BINDING=1
+
+
+
+
+# Job time limit
+export WALLTIME_MINUTES=1200
+export WALLTIME=$(( (${NEXP:-1} * WALLTIME_MINUTES) ))
+
+
+
+# Use the userbuffer backend to overlap tensor-parallel communication with compute (training.model.ub_tp_comm_overlap).
+export TP_COMM_OVERLAP=True
+
+# Execute nvidia-smi boost-slider --vboost
+export VBOOST_VALUE=1
+
+# Set MaxQ and MinEDP clocks
+export SET_MAXQ_CLK=0
+export MAXQ_CLK=""
+export SET_MINEDP_CLK=0
+export MINEDP_CLK=""
+
+# Set power limit
+export SET_POWER_CAP=0
+export POWER_CAP=""
+
+# Use CPU offloading (activations & weights).
+export CPU_OFFLOADING=False
+
+# Load the minimal number of samples
+export LOAD_MINIMAL_NUM_SAMPLES=0
+
+# Load distributed checkpoint directly on GPU
+export LOAD_DIRECTLY_ON_DEVICE=0
+
+
+
+# Extract system name
+export DGXSYSTEM=$(basename $(readlink -f ${BASH_SOURCE[0]}) | sed 's/^config_//' | sed 's/\.sh$//' )
+
+
+
+# Configure mlperf SYSJSON logging
+export MLPERF_SUBMITTER="Nebius"
+export MLPERF_SYSTEM_NAME="${DGXSYSTEM}"
+export MLPERF_STATUS="cloud"
+
+
+
+# Apply common settings
+source $(dirname ${BASH_SOURCE[0]})/config_common.sh
+
+# Apply FP8 settings
+source $(dirname ${BASH_SOURCE[0]})/config_fp8.sh
+
diff --git a/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_default.sh b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_default.sh
new file mode 120000
index 00000000..4fef3305
--- /dev/null
+++ b/soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_default.sh
@@ -0,0 +1 @@
+config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh
\ No newline at end of file

From f83971bd7127a733c61192db5a614c7932b49005 Mon Sep 17 00:00:00 2001
From: Pavel Sofronii
Date: Wed, 20 Nov 2024 15:51:03 +0100
Subject: [PATCH 3/5] nccl_use_infiniband true and nccl_benchmark_min_threshold 45

---
 soperator/installations/example/main.tf | 1 +
 soperator/installations/example/terraform.tfvars | 4 ++--
 soperator/installations/example/variables.tf | 10 ++++++++--
 soperator/modules/slurm/main.tf | 7 ++++---
 .../templates/helm_values/slurm_cluster.yaml.tftpl | 1 +
 soperator/modules/slurm/variables.tf | 8 +++++++-
 6 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/soperator/installations/example/main.tf b/soperator/installations/example/main.tf
index a6d3993a..59c57207 100644
--- a/soperator/installations/example/main.tf
+++ b/soperator/installations/example/main.tf
@@ -255,6 +255,7 @@ module "slurm" {
   nccl_benchmark_enable        = var.nccl_benchmark_enable
   nccl_benchmark_schedule      = var.nccl_benchmark_schedule
   nccl_benchmark_min_threshold = var.nccl_benchmark_min_threshold
+  nccl_use_infiniband          = var.nccl_use_infiniband
 
   telemetry_enabled                = var.telemetry_enabled
   telemetry_grafana_admin_password = var.telemetry_grafana_admin_password
diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars
index 408ac7b5..77b4ed8f 100644
--- a/soperator/installations/example/terraform.tfvars
+++ b/soperator/installations/example/terraform.tfvars
@@ -357,9 +357,9 @@ slurm_shared_memory_size_gibibytes = 256
 # nccl_benchmark_enable = "0 */3 * * *"
 
 # Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable.
-# By default, 420.
+# By default, 45.
 # ---
-# nccl_benchmark_min_threshold = 420
+# nccl_benchmark_min_threshold = 45
 
 # endregion NCCL benchmark
 
diff --git a/soperator/installations/example/variables.tf b/soperator/installations/example/variables.tf
index b6fa95f5..b1a9a480 100644
--- a/soperator/installations/example/variables.tf
+++ b/soperator/installations/example/variables.tf
@@ -504,10 +504,16 @@ variable "nccl_benchmark_schedule" {
 
 variable "nccl_benchmark_min_threshold" {
   description = "Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable."
   type        = number
-  default     = 420
+  default     = 45
 }
 
-# region NCCL benchmark
+variable "nccl_use_infiniband" {
+  description = "Whether to run the NCCL benchmark over InfiniBand: sets the NCCL_P2P_DISABLE=1, NCCL_SHM_DISABLE=1 and NCCL_ALGO=Ring env variables for the test."
+  type        = bool
+  default     = true
+}
+
+# endregion NCCL benchmark
 
 # region Telemetry
diff --git a/soperator/modules/slurm/main.tf b/soperator/modules/slurm/main.tf
index 02670d78..d52d99e6 100644
--- a/soperator/modules/slurm/main.tf
+++ b/soperator/modules/slurm/main.tf
@@ -163,9 +163,10 @@ resource "helm_release" "slurm_cluster" {
       nccl_topology_type = var.nccl_topology_type
 
       nccl_benchmark = {
-        enable        = var.nccl_benchmark_enable
-        schedule      = var.nccl_benchmark_schedule
-        min_threshold = var.nccl_benchmark_min_threshold
+        enable         = var.nccl_benchmark_enable
+        schedule       = var.nccl_benchmark_schedule
+        min_threshold  = var.nccl_benchmark_min_threshold
+        use_infiniband = var.nccl_use_infiniband
       }
 
       nodes = {
diff --git a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl
index 66457052..c420d949 100644
--- a/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl
+++ b/soperator/modules/slurm/templates/helm_values/slurm_cluster.yaml.tftpl
@@ -128,6 +128,7 @@ periodicChecks:
     schedule: "${nccl_benchmark.schedule}"
     ncclArguments:
       thresholdMoreThan: ${nccl_benchmark.min_threshold}
+      useInfiniband: ${nccl_benchmark.use_infiniband}
 
 slurmNodes:
   accounting:
diff --git a/soperator/modules/slurm/variables.tf b/soperator/modules/slurm/variables.tf
index 2eef573b..c9db7d15 100644
--- a/soperator/modules/slurm/variables.tf
+++ b/soperator/modules/slurm/variables.tf
@@ -206,7 +206,13 @@ variable "nccl_benchmark_schedule" {
 variable "nccl_benchmark_min_threshold" {
   description = "Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable."
   type        = number
-  default     = 420
+  default     = 45
+}
+
+variable "nccl_use_infiniband" {
+  description = "Whether to run the NCCL benchmark over InfiniBand: sets the NCCL_P2P_DISABLE=1, NCCL_SHM_DISABLE=1 and NCCL_ALGO=Ring env variables for the test."
+  type        = bool
+  default     = true
 }
 
 # endregion NCCL benchmark

From 212e487dde5522c1b37e4f8e16d684e6c7e14a20 Mon Sep 17 00:00:00 2001
From: Pavel Sofronii
Date: Wed, 20 Nov 2024 16:36:37 +0100
Subject: [PATCH 4/5] add nccl_use_infiniband to example

---
 soperator/installations/example/terraform.tfvars | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars
index 77b4ed8f..2355dd0a 100644
--- a/soperator/installations/example/terraform.tfvars
+++ b/soperator/installations/example/terraform.tfvars
@@ -361,6 +361,11 @@ slurm_shared_memory_size_gibibytes = 256
 # ---
 # nccl_benchmark_min_threshold = 45
 
+# Whether to run the NCCL benchmark over InfiniBand: sets the NCCL_P2P_DISABLE=1, NCCL_SHM_DISABLE=1 and NCCL_ALGO=Ring env variables for the test.
+# By default, true
+# ---
+# nccl_use_infiniband = true
+
 # endregion NCCL benchmark
 
 #----------------------------------------------------------------------------------------------------------------------#

From 0ca10bf6e11c465ea764d7eb7f2ca9d05e806d55 Mon Sep 17 00:00:00 2001
From: Pavel Sofronii
Date: Wed, 20 Nov 2024 17:02:04 +0100
Subject: [PATCH 5/5] bump soperator 1.15.3

---
 soperator/VERSION | 2 +-
 soperator/installations/example/terraform.tfvars | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/soperator/VERSION b/soperator/VERSION
index 42cf0675..f2380cc7 100644
--- a/soperator/VERSION
+++ b/soperator/VERSION
@@ -1 +1 @@
-1.15.2
+1.15.3
diff --git a/soperator/installations/example/terraform.tfvars b/soperator/installations/example/terraform.tfvars
index 2355dd0a..a740d787 100644
--- a/soperator/installations/example/terraform.tfvars
+++ b/soperator/installations/example/terraform.tfvars
@@ -168,7 +168,7 @@ slurm_cluster_name = "my-amazing-slurm"
 
 # Version of soperator.
 # ---
-slurm_operator_version = "1.15.2"
+slurm_operator_version = "1.15.3"
 
 # Type of the Slurm partition config. Could be either `default` or `custom`.
 # By default, "default".
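
A note on the batch-size rule used by the three H200 configs in PATCH 2/5: the configs check only MINIBS % PP, while the stated rule is GBS % (DP * PP * MICRO_BATCH_SIZE) == 0. Assuming the global batch size is derived as GBS = MINIBS * DP, with DP = (DGXNNODES * GPUs per node) / (TP * PP) as in the NVIDIA reference configs this implementation is based on, the sketch below reproduces the full rule for the NODEx64 defaults. It is a standalone illustration, not part of the patches, and the helper variable names are ours.

#!/bin/bash
# Standalone sketch: verify the batch-size rule from the H200 configs.
# Assumption: GBS = MINIBS * DP, where DP = (nodes * GPUs per node) / (TP * PP).
# Values below are the config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh defaults.

NODES=64
GPUS_PER_NODE=8
TP=2
PP=8
MINIBS=128
MICRO_BATCH_SIZE=2

DP=$(( NODES * GPUS_PER_NODE / (TP * PP) ))   # 512 / 16 = 32
GBS=$(( MINIBS * DP ))                        # 128 * 32 = 4096

if (( GBS % (DP * PP * MICRO_BATCH_SIZE) != 0 )); then
  echo "GBS=${GBS} violates GBS % (DP * PP * MICRO_BATCH_SIZE) == 0" >&2
  exit 1
fi
echo "OK: DP=${DP}, GBS=${GBS}, divisor=$(( DP * PP * MICRO_BATCH_SIZE ))"

The same arithmetic holds for the two 8-node configs: TPx4/PPx8/MICBSx2 gives DP = 2 and GBS = 256 (256 % 32 == 0), and TPx8/PPx8/MICBSx1 gives DP = 1 and GBS = 3072 (3072 % 8 == 0). Separately, the NSYS block from PATCH 1/5 is opt-in per run: passing the new -p flag to start.sh (for example, ./start.sh -N 64 -p together with the usual data, checkpoint and config options) exports NVTX_FLAG, PROFILE, PROFILE_START_STEP/PROFILE_END_STEP and PROFILE_RANKS, plus TARGET_LOG_PPL=2.75 for early stopping; the NVIDIA run scripts inside the container are expected to pick these up and capture steps 10-11 on ranks 0-7.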