Skip to content

Commit

Permalink
Merge pull request #93 from nebius/release/soperator
Browse files Browse the repository at this point in the history
Release soperator 1.15.3
  • Loading branch information
asteny authored Nov 20, 2024
2 parents d4a95b6 + 090ec6f commit 8584e20
Show file tree
Hide file tree
Showing 16 changed files with 328 additions and 15 deletions.
2 changes: 1 addition & 1 deletion soperator/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.15.2
1.15.3
1 change: 1 addition & 0 deletions soperator/installations/example/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,7 @@ module "slurm" {
nccl_benchmark_enable = var.nccl_benchmark_enable
nccl_benchmark_schedule = var.nccl_benchmark_schedule
nccl_benchmark_min_threshold = var.nccl_benchmark_min_threshold
nccl_use_infiniband = var.nccl_use_infiniband

telemetry_enabled = var.telemetry_enabled
telemetry_grafana_admin_password = var.telemetry_grafana_admin_password
Expand Down
11 changes: 8 additions & 3 deletions soperator/installations/example/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ slurm_cluster_name = "my-amazing-slurm"

# Version of soperator.
# ---
slurm_operator_version = "1.15.3"

# Type of the Slurm partition config. Could be either `default` or `custom`.
# By default, "default".
Expand Down Expand Up @@ -357,9 +357,14 @@ slurm_shared_memory_size_gibibytes = 256
# nccl_benchmark_enable = "0 */3 * * *"

# Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable.
# By default, 45.
# ---
# nccl_benchmark_min_threshold = 45

# Whether to use InfiniBand for the NCCL test: runs the benchmark with the
# NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring environment variables set.
# By default, true.
# ---
# nccl_use_infiniband = true

# endregion NCCL benchmark

Expand Down
10 changes: 8 additions & 2 deletions soperator/installations/example/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -504,10 +504,16 @@ variable "nccl_benchmark_schedule" {
# Minimal NCCL benchmark result (busbw) below which a GPU node is considered degraded.
variable "nccl_benchmark_min_threshold" {
  description = "Minimal threshold of NCCL benchmark for GPU performance to be considered as acceptable."
  type        = number
  default     = 45
}

# When true, the NCCL test is run with NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1
# NCCL_ALGO=Ring set, forcing traffic off P2P/shared-memory transports
# (presumably onto InfiniBand — confirm against the benchmark job template).
variable "nccl_use_infiniband" {
  description = "Whether to use InfiniBand for the NCCL test: sets NCCL_P2P_DISABLE=1 NCCL_SHM_DISABLE=1 NCCL_ALGO=Ring env variables for the test."
  type        = bool
  default     = true
}

# endregion NCCL benchmark

# region Telemetry

Expand Down
15 changes: 15 additions & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,21 @@ RUN rm -rf /opt/hpcx/nccl_rdma_sharp_plugin && \
RUN pip install huggingface_hub==0.23.2
RUN pip install -v "transformers<=4.40.2"

## Reinstall NCCL to the latest version
#RUN wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
#RUN dpkg -i cuda-keyring_1.1-1_all.deb
#RUN apt-get update
#RUN apt install libnccl2=2.23.4-1+cuda12.4 libnccl-dev=2.23.4-1+cuda12.4

## Install NCCL profiler plugin
#RUN git clone https://github.com/NVIDIA/nccl && \
# cd nccl && \
# git checkout v2.23.4-1 && \
# cd ext-profiler/example && \
# make && \
# cp libnccl-profiler.so /usr/lib/x86_64-linux-gnu/


# Benchmark code
WORKDIR /workspace/llm

Expand Down
2 changes: 1 addition & 1 deletion soperator/mlperf/gpt3-impl-4.0-nvidia/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
4.0-16
4.0-20
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash
# MLPerf GPT-3 training config: 64 nodes, TP=2, PP=8, VP=4, MINIBS=128, MICBS=2.
# All DL params may be overridden via the environment before sourcing/running.

# DL params
export DGXNNODES="${DGXNNODES:=64}"                            # NODEx64
export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=2}"     # TPx2 (training.model.tensor_model_parallel_size)
export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size)
export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}"       # VPx4
export MINIBS="${MINIBS:=128}"                                 # MINBSx128
export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=2}"               # MICBSx2

# Check DL params.
# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0, which simplifies to MINIBS % PP == 0.
if (( MINIBS % PIPELINE_MODEL_PARALLEL != 0 )); then
  echo "MINIBS should be divisible by PP" >&2
  exit 1
fi

# Slurm resource allocation
export SBATCH_GPUS_PER_NODE="8"
export SBATCH_MEM_PER_NODE="1200G"
export SBATCH_TRES_PER_TASK="cpu=16"
export SBATCH_DISTRIBUTION="block:block:block"
export SLURM_CPU_BIND="verbose,none"
#export EXCLUSIVE=1

# Use bindpcie CPU pinning
export ENABLE_CPU_EXCLUSIVE=1
export ENABLE_IB_BINDING=1

# Job time limit: WALLTIME_MINUTES per experiment, NEXP experiments (default 1).
export WALLTIME_MINUTES=1200
export WALLTIME=$(( ${NEXP:-1} * WALLTIME_MINUTES ))

# Use userbuffer backend to overlap tensor-parallel communications with compute
# (training.model.ub_tp_comm_overlap).
export TP_COMM_OVERLAP=True

# Value passed to `nvidia-smi boost-slider --vboost <value>`.
export VBOOST_VALUE=1

# Set MaxQ and MinEDP clocks (disabled).
export SET_MAXQ_CLK=0
export MAXQ_CLK=""
export SET_MINEDP_CLK=0
export MINEDP_CLK=""

# Set power limit (disabled).
export SET_POWER_CAP=0
export POWER_CAP=""

# Use CPU offloading (activations & weights).
export CPU_OFFLOADING=False

# Load the minimal number of samples
export LOAD_MINIMAL_NUM_SAMPLES=0

# Load distributed checkpoint directly on GPU
export LOAD_DIRECTLY_ON_DEVICE=0

# Extract system name from this file's name: config_<name>.sh -> <name>.
export DGXSYSTEM=$(basename "$(readlink -f "${BASH_SOURCE[0]}")" | sed -e 's/^config_//' -e 's/\.sh$//')

# Configure mlperf SYSJSON logging
export MLPERF_SUBMITTER="Nebius"
export MLPERF_SYSTEM_NAME="${DGXSYSTEM}"
export MLPERF_STATUS="cloud"

# Apply common settings
source "$(dirname "${BASH_SOURCE[0]}")/config_common.sh"

# Apply FP8 settings
source "$(dirname "${BASH_SOURCE[0]}")/config_fp8.sh"

Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash
# MLPerf GPT-3 training config: 8 nodes, TP=4, PP=8, VP=4, MINIBS=128, MICBS=2.
# All DL params may be overridden via the environment before sourcing/running.

# DL params
export DGXNNODES="${DGXNNODES:=8}"                             # NODEx8
export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=4}"     # TPx4 (training.model.tensor_model_parallel_size)
export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size)
export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}"       # VPx4
export MINIBS="${MINIBS:=128}"                                 # MINBSx128
export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=2}"               # MICBSx2

# Check DL params.
# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0, which simplifies to MINIBS % PP == 0.
if (( MINIBS % PIPELINE_MODEL_PARALLEL != 0 )); then
  echo "MINIBS should be divisible by PP" >&2
  exit 1
fi

# Slurm resource allocation
export SBATCH_GPUS_PER_NODE="8"
export SBATCH_MEM_PER_NODE="1200G"
export SBATCH_TRES_PER_TASK="cpu=16"
export SBATCH_DISTRIBUTION="block:block:block"
export SLURM_CPU_BIND="verbose,none"
#export EXCLUSIVE=1

# Use bindpcie CPU pinning
export ENABLE_CPU_EXCLUSIVE=1
export ENABLE_IB_BINDING=1

# Job time limit: WALLTIME_MINUTES per experiment, NEXP experiments (default 1).
export WALLTIME_MINUTES=1200
export WALLTIME=$(( ${NEXP:-1} * WALLTIME_MINUTES ))

# Use userbuffer backend to overlap tensor-parallel communications with compute
# (training.model.ub_tp_comm_overlap).
export TP_COMM_OVERLAP=True

# Value passed to `nvidia-smi boost-slider --vboost <value>`.
export VBOOST_VALUE=1

# Set MaxQ and MinEDP clocks (disabled).
export SET_MAXQ_CLK=0
export MAXQ_CLK=""
export SET_MINEDP_CLK=0
export MINEDP_CLK=""

# Set power limit (disabled).
export SET_POWER_CAP=0
export POWER_CAP=""

# Use CPU offloading (activations & weights).
export CPU_OFFLOADING=False

# Load the minimal number of samples
export LOAD_MINIMAL_NUM_SAMPLES=0

# Load distributed checkpoint directly on GPU
export LOAD_DIRECTLY_ON_DEVICE=0

# Extract system name from this file's name: config_<name>.sh -> <name>.
export DGXSYSTEM=$(basename "$(readlink -f "${BASH_SOURCE[0]}")" | sed -e 's/^config_//' -e 's/\.sh$//')

# Configure mlperf SYSJSON logging
export MLPERF_SUBMITTER="Nebius"
export MLPERF_SYSTEM_NAME="${DGXSYSTEM}"
export MLPERF_STATUS="cloud"

# Apply common settings
source "$(dirname "${BASH_SOURCE[0]}")/config_common.sh"

# Apply FP8 settings
source "$(dirname "${BASH_SOURCE[0]}")/config_fp8.sh"

Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
#!/bin/bash
# MLPerf GPT-3 training config: 8 nodes, TP=8, PP=8, VP=4, MINIBS=3072, MICBS=1.
# All DL params may be overridden via the environment before sourcing/running.

# DL params
export DGXNNODES="${DGXNNODES:=8}"                             # NODEx8
export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=8}"     # TPx8 (training.model.tensor_model_parallel_size)
export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size)
export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}"       # VPx4
export MINIBS="${MINIBS:=3072}"                                # MINBSx3072
export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=1}"               # MICBSx1

# Check DL params.
# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0, which simplifies to MINIBS % PP == 0.
if (( MINIBS % PIPELINE_MODEL_PARALLEL != 0 )); then
  echo "MINIBS should be divisible by PP" >&2
  exit 1
fi

# Slurm resource allocation
export SBATCH_GPUS_PER_NODE="8"
export SBATCH_MEM_PER_NODE="1200G"
export SBATCH_TRES_PER_TASK="cpu=16"
export SBATCH_DISTRIBUTION="block:block:block"
export SLURM_CPU_BIND="verbose,none"
#export EXCLUSIVE=1

# Use bindpcie CPU pinning
export ENABLE_CPU_EXCLUSIVE=1
export ENABLE_IB_BINDING=1

# Job time limit: WALLTIME_MINUTES per experiment, NEXP experiments (default 1).
export WALLTIME_MINUTES=1200
export WALLTIME=$(( ${NEXP:-1} * WALLTIME_MINUTES ))

# Use userbuffer backend to overlap tensor-parallel communications with compute
# (training.model.ub_tp_comm_overlap).
export TP_COMM_OVERLAP=True

# Value passed to `nvidia-smi boost-slider --vboost <value>`.
export VBOOST_VALUE=1

# Set MaxQ and MinEDP clocks (disabled).
export SET_MAXQ_CLK=0
export MAXQ_CLK=""
export SET_MINEDP_CLK=0
export MINEDP_CLK=""

# Set power limit (disabled).
export SET_POWER_CAP=0
export POWER_CAP=""

# Use CPU offloading (activations & weights).
export CPU_OFFLOADING=False

# Load the minimal number of samples
export LOAD_MINIMAL_NUM_SAMPLES=0

# Load distributed checkpoint directly on GPU
export LOAD_DIRECTLY_ON_DEVICE=0

# Extract system name from this file's name: config_<name>.sh -> <name>.
export DGXSYSTEM=$(basename "$(readlink -f "${BASH_SOURCE[0]}")" | sed -e 's/^config_//' -e 's/\.sh$//')

# Configure mlperf SYSJSON logging
export MLPERF_SUBMITTER="Nebius"
export MLPERF_SYSTEM_NAME="${DGXSYSTEM}"
export MLPERF_STATUS="cloud"

# Apply common settings
source "$(dirname "${BASH_SOURCE[0]}")/config_common.sh"

# Apply FP8 settings
source "$(dirname "${BASH_SOURCE[0]}")/config_fp8.sh"

1 change: 1 addition & 0 deletions soperator/mlperf/gpt3-impl-4.0-nvidia/run.sub
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ if [ -n "${CONTAINER_PRELOAD_SHARED_PATH}" ]; then
CONT_FILE="${CONTAINER_PRELOAD_SHARED_PATH}/containers/${SLURM_JOBID}_$(basename ${CONT}).squashfs"
mkdir -p "${CONTAINER_PRELOAD_SHARED_PATH}/containers"
# Prepull container image to the shared filesystem
srun --ntasks=1 enroot import --output ${CONT_FILE} docker://${CONT}
else
CONT_FILE=${CONT}
Expand Down
Loading

0 comments on commit 8584e20

Please sign in to comment.