-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #93 from nebius/release/soperator
Release soperator 1.15.3
- Loading branch information
Showing
16 changed files
with
328 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
1.15.2 | ||
1.15.3 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
4.0-16 | ||
4.0-20 |
86 changes: 86 additions & 0 deletions
86
...tor/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#!/bin/bash
#
# MLPerf GPT-3 run configuration: H200x8, 64 nodes, TP=2, PP=8, VP=4,
# MINIBS=128, MICRO_BATCH_SIZE=2.
#
# Every "DL params" value is only a default and can be overridden from the
# environment. Sibling files are located through BASH_SOURCE rather than $0,
# so paths stay correct when this file is sourced. Shell options (set -e etc.)
# are intentionally not changed here, so sourcing does not alter the caller.

# DL params
export DGXNNODES="${DGXNNODES:=64}"                            # NODEx64
export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=2}"     # TPx2 (training.model.tensor_model_parallel_size)
export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size)
export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}"       # VPx4
export MINIBS="${MINIBS:=128}"                                 # MINBSx128
export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=2}"               # MICBSx2

# Check DL params
# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0
# This simplifies to MINIBS % PP == 0
if (( MINIBS % PIPELINE_MODEL_PARALLEL != 0 )); then
  printf '%s\n' "MINIBS should be divisible by PP" >&2
  # `exit` (not `return`) is kept on purpose: an invalid configuration aborts
  # the shell that loaded this file instead of letting the run continue.
  exit 1
fi

# Slurm resource allocation
export SBATCH_GPUS_PER_NODE="8"
export SBATCH_MEM_PER_NODE="1200G"
export SBATCH_TRES_PER_TASK="cpu=16"
export SBATCH_DISTRIBUTION="block:block:block"
export SLURM_CPU_BIND="verbose,none"
#export EXCLUSIVE=1

# Use bindpcie CPU pinning
export ENABLE_CPU_EXCLUSIVE=1
export ENABLE_IB_BINDING=1

# Job time limit: WALLTIME_MINUTES per experiment, multiplied by the number of
# experiments (NEXP, defaulting to 1 when unset or empty).
export WALLTIME_MINUTES=1200
export WALLTIME=$(( ${NEXP:-1} * WALLTIME_MINUTES ))

# Use the userbuffer backend to overlap tensor-parallel communication with
# compute (training.model.ub_tp_comm_overlap).
export TP_COMM_OVERLAP=True

# Value passed to `nvidia-smi boost-slider --vboost <value>`
export VBOOST_VALUE=1

# Set MaxQ and MinEDP clocks
export SET_MAXQ_CLK=0
export MAXQ_CLK=""
export SET_MINEDP_CLK=0
export MINEDP_CLK=""

# Set power limit
export SET_POWER_CAP=0
export POWER_CAP=""

# Use CPU offloading (activations & weights).
export CPU_OFFLOADING=False

# Load the minimal number of samples
export LOAD_MINIMAL_NUM_SAMPLES=0

# Load distributed checkpoint directly on GPU
export LOAD_DIRECTLY_ON_DEVICE=0

# Extract system name: the symlink-resolved file name of this config without
# its "config_" prefix and ".sh" suffix. Assignment and export are separate so
# a failing command substitution is not masked by `export`.
DGXSYSTEM="$(basename -- "$(readlink -f -- "${BASH_SOURCE[0]}")" \
  | sed -e 's/^config_//' -e 's/\.sh$//')"
export DGXSYSTEM

# Configure mlperf SYSJSON logging
export MLPERF_SUBMITTER="Nebius"
export MLPERF_SYSTEM_NAME="${DGXSYSTEM}"
export MLPERF_STATUS="cloud"

# Apply common settings
source "$(dirname -- "${BASH_SOURCE[0]}")/config_common.sh"

# Apply FP8 settings
source "$(dirname -- "${BASH_SOURCE[0]}")/config_fp8.sh"
|
1 change: 1 addition & 0 deletions
1
soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx64_default.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
config_H200x8_NODEx64_TPx2_PPx8_VPx4_MINBSx128_MICBSx2.sh |
86 changes: 86 additions & 0 deletions
86
...ator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#!/bin/bash
#
# MLPerf GPT-3 run configuration: H200x8, 8 nodes, TP=4, PP=8, VP=4,
# MINIBS=128, MICRO_BATCH_SIZE=2.
#
# Every "DL params" value is only a default and can be overridden from the
# environment. Sibling files are located through BASH_SOURCE rather than $0,
# so paths stay correct when this file is sourced. Shell options (set -e etc.)
# are intentionally not changed here, so sourcing does not alter the caller.

# DL params
export DGXNNODES="${DGXNNODES:=8}"                             # NODEx8
export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=4}"     # TPx4 (training.model.tensor_model_parallel_size)
export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size)
export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}"       # VPx4
export MINIBS="${MINIBS:=128}"                                 # MINBSx128
export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=2}"               # MICBSx2

# Check DL params
# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0
# This simplifies to MINIBS % PP == 0
if (( MINIBS % PIPELINE_MODEL_PARALLEL != 0 )); then
  printf '%s\n' "MINIBS should be divisible by PP" >&2
  # `exit` (not `return`) is kept on purpose: an invalid configuration aborts
  # the shell that loaded this file instead of letting the run continue.
  exit 1
fi

# Slurm resource allocation
export SBATCH_GPUS_PER_NODE="8"
export SBATCH_MEM_PER_NODE="1200G"
export SBATCH_TRES_PER_TASK="cpu=16"
export SBATCH_DISTRIBUTION="block:block:block"
export SLURM_CPU_BIND="verbose,none"
#export EXCLUSIVE=1

# Use bindpcie CPU pinning
export ENABLE_CPU_EXCLUSIVE=1
export ENABLE_IB_BINDING=1

# Job time limit: WALLTIME_MINUTES per experiment, multiplied by the number of
# experiments (NEXP, defaulting to 1 when unset or empty).
export WALLTIME_MINUTES=1200
export WALLTIME=$(( ${NEXP:-1} * WALLTIME_MINUTES ))

# Use the userbuffer backend to overlap tensor-parallel communication with
# compute (training.model.ub_tp_comm_overlap).
export TP_COMM_OVERLAP=True

# Value passed to `nvidia-smi boost-slider --vboost <value>`
export VBOOST_VALUE=1

# Set MaxQ and MinEDP clocks
export SET_MAXQ_CLK=0
export MAXQ_CLK=""
export SET_MINEDP_CLK=0
export MINEDP_CLK=""

# Set power limit
export SET_POWER_CAP=0
export POWER_CAP=""

# Use CPU offloading (activations & weights).
export CPU_OFFLOADING=False

# Load the minimal number of samples
export LOAD_MINIMAL_NUM_SAMPLES=0

# Load distributed checkpoint directly on GPU
export LOAD_DIRECTLY_ON_DEVICE=0

# Extract system name: the symlink-resolved file name of this config without
# its "config_" prefix and ".sh" suffix. Assignment and export are separate so
# a failing command substitution is not masked by `export`.
DGXSYSTEM="$(basename -- "$(readlink -f -- "${BASH_SOURCE[0]}")" \
  | sed -e 's/^config_//' -e 's/\.sh$//')"
export DGXSYSTEM

# Configure mlperf SYSJSON logging
export MLPERF_SUBMITTER="Nebius"
export MLPERF_SYSTEM_NAME="${DGXSYSTEM}"
export MLPERF_STATUS="cloud"

# Apply common settings
source "$(dirname -- "${BASH_SOURCE[0]}")/config_common.sh"

# Apply FP8 settings
source "$(dirname -- "${BASH_SOURCE[0]}")/config_fp8.sh"
|
86 changes: 86 additions & 0 deletions
86
...tor/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_TPx8_PPx8_VPx4_MINBSx3072_MICBSx1.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
#!/bin/bash
#
# MLPerf GPT-3 run configuration: H200x8, 8 nodes, TP=8, PP=8, VP=4,
# MINIBS=3072, MICRO_BATCH_SIZE=1.
#
# Every "DL params" value is only a default and can be overridden from the
# environment. Sibling files are located through BASH_SOURCE rather than $0,
# so paths stay correct when this file is sourced. Shell options (set -e etc.)
# are intentionally not changed here, so sourcing does not alter the caller.

# DL params
export DGXNNODES="${DGXNNODES:=8}"                             # NODEx8
export TENSOR_MODEL_PARALLEL="${TENSOR_MODEL_PARALLEL:=8}"     # TPx8 (training.model.tensor_model_parallel_size)
export PIPELINE_MODEL_PARALLEL="${PIPELINE_MODEL_PARALLEL:=8}" # PPx8 (training.model.pipeline_model_parallel_size)
export INTERLEAVED_PIPELINE="${INTERLEAVED_PIPELINE:=4}"       # VPx4
export MINIBS="${MINIBS:=3072}"                                # MINBSx3072
export MICRO_BATCH_SIZE="${MICRO_BATCH_SIZE:=1}"               # MICBSx1

# Check DL params
# Rule: GBS % (DP * PP * MICRO_BATCH_SIZE) == 0
# This simplifies to MINIBS % PP == 0
if (( MINIBS % PIPELINE_MODEL_PARALLEL != 0 )); then
  printf '%s\n' "MINIBS should be divisible by PP" >&2
  # `exit` (not `return`) is kept on purpose: an invalid configuration aborts
  # the shell that loaded this file instead of letting the run continue.
  exit 1
fi

# Slurm resource allocation
export SBATCH_GPUS_PER_NODE="8"
export SBATCH_MEM_PER_NODE="1200G"
export SBATCH_TRES_PER_TASK="cpu=16"
export SBATCH_DISTRIBUTION="block:block:block"
export SLURM_CPU_BIND="verbose,none"
#export EXCLUSIVE=1

# Use bindpcie CPU pinning
export ENABLE_CPU_EXCLUSIVE=1
export ENABLE_IB_BINDING=1

# Job time limit: WALLTIME_MINUTES per experiment, multiplied by the number of
# experiments (NEXP, defaulting to 1 when unset or empty).
export WALLTIME_MINUTES=1200
export WALLTIME=$(( ${NEXP:-1} * WALLTIME_MINUTES ))

# Use the userbuffer backend to overlap tensor-parallel communication with
# compute (training.model.ub_tp_comm_overlap).
export TP_COMM_OVERLAP=True

# Value passed to `nvidia-smi boost-slider --vboost <value>`
export VBOOST_VALUE=1

# Set MaxQ and MinEDP clocks
export SET_MAXQ_CLK=0
export MAXQ_CLK=""
export SET_MINEDP_CLK=0
export MINEDP_CLK=""

# Set power limit
export SET_POWER_CAP=0
export POWER_CAP=""

# Use CPU offloading (activations & weights).
export CPU_OFFLOADING=False

# Load the minimal number of samples
export LOAD_MINIMAL_NUM_SAMPLES=0

# Load distributed checkpoint directly on GPU
export LOAD_DIRECTLY_ON_DEVICE=0

# Extract system name: the symlink-resolved file name of this config without
# its "config_" prefix and ".sh" suffix. Assignment and export are separate so
# a failing command substitution is not masked by `export`.
DGXSYSTEM="$(basename -- "$(readlink -f -- "${BASH_SOURCE[0]}")" \
  | sed -e 's/^config_//' -e 's/\.sh$//')"
export DGXSYSTEM

# Configure mlperf SYSJSON logging
export MLPERF_SUBMITTER="Nebius"
export MLPERF_SYSTEM_NAME="${DGXSYSTEM}"
export MLPERF_STATUS="cloud"

# Apply common settings
source "$(dirname -- "${BASH_SOURCE[0]}")/config_common.sh"

# Apply FP8 settings
source "$(dirname -- "${BASH_SOURCE[0]}")/config_fp8.sh"
|
1 change: 1 addition & 0 deletions
1
soperator/mlperf/gpt3-impl-4.0-nvidia/config_H200x8_NODEx8_default.sh
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
config_H200x8_NODEx8_TPx4_PPx8_VPx4_MINBSx128_MICBSx2.sh |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.