Skip to content

Commit

Permalink
Merge pull request #93 from nebius/release/soperator
Browse files Browse the repository at this point in the history
Release soperator 1.15.3
  • Loading branch information
asteny authored Nov 20, 2024
2 parents d4a95b6 + 090ec6f commit 8584e20
Show file tree
Hide file tree
Showing 16 changed files with 328 additions and 15 deletions.
2 changes: 1 addition & 1 deletion soperator/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.15.2
1.15.3
1 change: 1 addition & 0 deletions soperator/installations/example/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ module "slurm" {
nccl_benchmark_enable = var.nccl_benchmark_enable
nccl_benchmark_schedule = var.nccl_benchmark_schedule
nccl_benchmark_min_threshold = var.nccl_benchmark_min_threshold
nccl_use_infiniband = var.nccl_use_infiniband

telemetry_enabled = var.telemetry_enabled
telemetry_grafana_admin_password = var.telemetry_grafana_admin_password
Expand Down
11 changes: 8 additions & 3 deletions soperator/installations/example/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ slurm_cluster_name = "my-amazing-slurm"

# Version of soperator.
# ---
slurm_operator_version = "1.15.3"

# Type of the Slurm partition config. Could be either `default` or `custom`.
# By default, "default".
Expand Down Expand Up @@ -357,9 +357,14 @@ slurm_shared_memory_size_gibibytes = 256
# nccl_benchmark_enable = "0 */3 * * *"

# Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable.
# By default, 45.
# ---
# nccl_benchmark_min_threshold = 45

# Whether to use InfiniBand for the NCCL test: runs the benchmark with the
# NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring environment variables set.
# By default, true.
# ---
# nccl_use_infiniband = true

# endregion NCCL benchmark

Expand Down
10 changes: 8 additions & 2 deletions soperator/installations/example/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -504,10 +504,16 @@ variable "nccl_benchmark_schedule" {
# Minimal NCCL benchmark result (busbw) below which a GPU node is considered degraded.
variable "nccl_benchmark_min_threshold" {
  description = "Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable."
  type        = number
  default     = 45
}

# When true, the NCCL test is run with NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1
# NCCL_ALGO=Ring set, forcing traffic off P2P/shared-memory transports
# (presumably onto InfiniBand — confirm against the benchmark job template).
variable "nccl_use_infiniband" {
  description = "Whether to use InfiniBand for the NCCL test: sets NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for the test."
  type        = bool
  default     = true
}

# endregion NCCL benchmark

# region Telemetry

Expand Down
15 changes: 15 additions & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,21 @@ RUN rm -rf /opt/hpcx/nccl_rdma_sharp_plugin && \
RUN pip install huggingface_hub==0.23.2
RUN pip install -v "transformers<=4.40.2"

## Reinstall NCCL to the latest version
#RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
#RUN dpkg -i cuda-keyring_1.1-1_all.deb
#RUN apt-get update
#RUN apt install libnccl2=2.23.4-1+cuda12.4 libnccl-dev=2.23.4-1+cuda12.4

## Install NCCL profiler plugin
#RUN git clone https://github.com/NVIDIA/nccl && \
# cd nccl && \
# git checkout v2.23.4-1 && \
# cd ext-profiler/example && \
# make && \
# cp libnccl-profiler.so /usr/lib/x86_64-linux-gnu/


# Benchmark code
WORKDIR /workspace/llm

Expand Down
2 changes: 1 addition & 1 deletion soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
4.0-16
4.0-20
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash
# MLPerf GPT-3 training config: 64 nodes, TP=2, PP=8, VP=4, MINIBS=128, MICBS=2.
# All DL params may be overridden via the environment before sourcing/running.

# DL params
export DGXNNODES="${DGXNNODES:=64}"                            # NODEx64
export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=2}"     # TPx2 (training.model.tensor_model_parallel_size)
export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size)
export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}"       # VPx4
export MINIBS="${MINIBS:=128}"                                 # MINBSx128
export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=2}"               # MICBSx2

# Check DL params.
# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0, which simplifies to MINIBS % PP == 0.
if (( MINIBS % PIPELINE_MODEL_PARALLEL != 0 )); then
  echo "MINIBS should be divisible by PP" >&2
  exit 1
fi

# Slurm resource allocation
export SBATCH_GPUS_PER_NODE="8"
export SBATCH_MEM_PER_NODE="1200G"
export SBATCH_TRES_PER_TASK="cpu=16"
export SBATCH_DISTRIBUTION="block:block:block"
export SLURM_CPU_BIND="verbose,none"
#export EXCLUSIVE=1

# Use bindpcie CPU pinning
export ENABLE_CPU_EXCLUSIVE=1
export ENABLE_IB_BINDING=1

# Job time limit: WALLTIME_MINUTES per experiment, NEXP experiments (default 1).
export WALLTIME_MINUTES=1200
export WALLTIME=$(( ${NEXP:-1} * WALLTIME_MINUTES ))

# Use userbuffer backend to overlap tensor-parallel communications with compute
# (training.model.ub_tp_comm_overlap).
export TP_COMM_OVERLAP=True

# Value passed to `nvidia-smi boost-slider --vboost <value>`.
export VBOOST_VALUE=1

# Set MaxQ and MinEDP clocks (disabled).
export SET_MAXQ_CLK=0
export MAXQ_CLK=""
export SET_MINEDP_CLK=0
export MINEDP_CLK=""

# Set power limit (disabled).
export SET_POWER_CAP=0
export POWER_CAP=""

# Use CPU offloading (activations & weights).
export CPU_OFFLOADING=False

# Load the minimal number of samples
export LOAD_MINIMAL_NUM_SAMPLES=0

# Load distributed checkpoint directly on GPU
export LOAD_DIRECTLY_ON_DEVICE=0

# Extract system name from this file's name: config_<name>.sh -> <name>.
export DGXSYSTEM=$(basename "$(readlink -f "${BASH_SOURCE[0]}")" | sed -e 's/^config_//' -e 's/\.sh$//')

# Configure mlperf SYSJSON logging
export MLPERF_SUBMITTER="Nebius"
export MLPERF_SYSTEM_NAME="${DGXSYSTEM}"
export MLPERF_STATUS="cloud"

# Apply common settings
source "$(dirname "${BASH_SOURCE[0]}")/config_common.sh"

# Apply FP8 settings
source "$(dirname "${BASH_SOURCE[0]}")/config_fp8.sh"

Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash
# MLPerf GPT-3 training config: 8 nodes, TP=4, PP=8, VP=4, MINIBS=128, MICBS=2.
# All DL params may be overridden via the environment before sourcing/running.

# DL params
export DGXNNODES="${DGXNNODES:=8}"                             # NODEx8
export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=4}"     # TPx4 (training.model.tensor_model_parallel_size)
export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size)
export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}"       # VPx4
export MINIBS="${MINIBS:=128}"                                 # MINBSx128
export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=2}"               # MICBSx2

# Check DL params.
# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0, which simplifies to MINIBS % PP == 0.
if (( MINIBS % PIPELINE_MODEL_PARALLEL != 0 )); then
  echo "MINIBS should be divisible by PP" >&2
  exit 1
fi

# Slurm resource allocation
export SBATCH_GPUS_PER_NODE="8"
export SBATCH_MEM_PER_NODE="1200G"
export SBATCH_TRES_PER_TASK="cpu=16"
export SBATCH_DISTRIBUTION="block:block:block"
export SLURM_CPU_BIND="verbose,none"
#export EXCLUSIVE=1

# Use bindpcie CPU pinning
export ENABLE_CPU_EXCLUSIVE=1
export ENABLE_IB_BINDING=1

# Job time limit: WALLTIME_MINUTES per experiment, NEXP experiments (default 1).
export WALLTIME_MINUTES=1200
export WALLTIME=$(( ${NEXP:-1} * WALLTIME_MINUTES ))

# Use userbuffer backend to overlap tensor-parallel communications with compute
# (training.model.ub_tp_comm_overlap).
export TP_COMM_OVERLAP=True

# Value passed to `nvidia-smi boost-slider --vboost <value>`.
export VBOOST_VALUE=1

# Set MaxQ and MinEDP clocks (disabled).
export SET_MAXQ_CLK=0
export MAXQ_CLK=""
export SET_MINEDP_CLK=0
export MINEDP_CLK=""

# Set power limit (disabled).
export SET_POWER_CAP=0
export POWER_CAP=""

# Use CPU offloading (activations & weights).
export CPU_OFFLOADING=False

# Load the minimal number of samples
export LOAD_MINIMAL_NUM_SAMPLES=0

# Load distributed checkpoint directly on GPU
export LOAD_DIRECTLY_ON_DEVICE=0

# Extract system name from this file's name: config_<name>.sh -> <name>.
export DGXSYSTEM=$(basename "$(readlink -f "${BASH_SOURCE[0]}")" | sed -e 's/^config_//' -e 's/\.sh$//')

# Configure mlperf SYSJSON logging
export MLPERF_SUBMITTER="Nebius"
export MLPERF_SYSTEM_NAME="${DGXSYSTEM}"
export MLPERF_STATUS="cloud"

# Apply common settings
source "$(dirname "${BASH_SOURCE[0]}")/config_common.sh"

# Apply FP8 settings
source "$(dirname "${BASH_SOURCE[0]}")/config_fp8.sh"

Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash
# MLPerf GPT-3 training config: 8 nodes, TP=8, PP=8, VP=4, MINIBS=3072, MICBS=1.
# All DL params may be overridden via the environment before sourcing/running.

# DL params
export DGXNNODES="${DGXNNODES:=8}"                             # NODEx8
export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=8}"     # TPx8 (training.model.tensor_model_parallel_size)
export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size)
export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}"       # VPx4
export MINIBS="${MINIBS:=3072}"                                # MINBSx3072
export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=1}"               # MICBSx1

# Check DL params.
# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0, which simplifies to MINIBS % PP == 0.
if (( MINIBS % PIPELINE_MODEL_PARALLEL != 0 )); then
  echo "MINIBS should be divisible by PP" >&2
  exit 1
fi

# Slurm resource allocation
export SBATCH_GPUS_PER_NODE="8"
export SBATCH_MEM_PER_NODE="1200G"
export SBATCH_TRES_PER_TASK="cpu=16"
export SBATCH_DISTRIBUTION="block:block:block"
export SLURM_CPU_BIND="verbose,none"
#export EXCLUSIVE=1

# Use bindpcie CPU pinning
export ENABLE_CPU_EXCLUSIVE=1
export ENABLE_IB_BINDING=1

# Job time limit: WALLTIME_MINUTES per experiment, NEXP experiments (default 1).
export WALLTIME_MINUTES=1200
export WALLTIME=$(( ${NEXP:-1} * WALLTIME_MINUTES ))

# Use userbuffer backend to overlap tensor-parallel communications with compute
# (training.model.ub_tp_comm_overlap).
export TP_COMM_OVERLAP=True

# Value passed to `nvidia-smi boost-slider --vboost <value>`.
export VBOOST_VALUE=1

# Set MaxQ and MinEDP clocks (disabled).
export SET_MAXQ_CLK=0
export MAXQ_CLK=""
export SET_MINEDP_CLK=0
export MINEDP_CLK=""

# Set power limit (disabled).
export SET_POWER_CAP=0
export POWER_CAP=""

# Use CPU offloading (activations & weights).
export CPU_OFFLOADING=False

# Load the minimal number of samples
export LOAD_MINIMAL_NUM_SAMPLES=0

# Load distributed checkpoint directly on GPU
export LOAD_DIRECTLY_ON_DEVICE=0

# Extract system name from this file's name: config_<name>.sh -> <name>.
export DGXSYSTEM=$(basename "$(readlink -f "${BASH_SOURCE[0]}")" | sed -e 's/^config_//' -e 's/\.sh$//')

# Configure mlperf SYSJSON logging
export MLPERF_SUBMITTER="Nebius"
export MLPERF_SYSTEM_NAME="${DGXSYSTEM}"
export MLPERF_STATUS="cloud"

# Apply common settings
source "$(dirname "${BASH_SOURCE[0]}")/config_common.sh"

# Apply FP8 settings
source "$(dirname "${BASH_SOURCE[0]}")/config_fp8.sh"

1 change: 1 addition & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ if [ -n "${CONTAINER_PRELOAD_SHARED_PATH}" ]; then
CONT_FILE="${CONTAINER_PRELOAD_SHARED_PATH}/containers/${SLURM_JOBID}_$(basename ${CONT}).squashfs"
mkdir -p "${CONTAINER_PRELOAD_SHARED_PATH}/containers"
# Prepull container image to the shared filesystem
srun --ntasks=1 enroot import --output ${CONT_FILE} docker://${CONT}
else
CONT_FILE=${CONT}
Expand Down
Loading

0 comments on commit 8584e20

Please sign in to comment.