update llama2/3 configs
Signed-off-by: Sangkug Lym <[email protected]>
erhoo82 committed Aug 2, 2024
1 parent 8325688 commit fee1762
Showing 12 changed files with 190 additions and 3 deletions.
19 changes: 19 additions & 0 deletions examples/training/llama/h100/llama3_405b_bf16.sh
@@ -0,0 +1,19 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=llama/llama3_405b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="llama3_1_405b_bf16" \
training.run.time_limit=0:30:00 \
training.trainer.num_nodes=72 \
training.model.global_batch_size=252 \
training.model.tokenizer.model=${TOK_PATH} \
training.model.optim.grad_sync_dtype=bf16 \
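For reference, a minimal invocation of this new script might look like the sketch below; the exported paths are illustrative placeholders, not values from this commit.

# Hypothetical usage sketch (paths are placeholders)
export NEMO_FRAMEWORK_LAUNCHER_DIR=/opt/NeMo-Framework-Launcher
export DATA_DIR=/path/to/preprocessed_llama_data
export TOK_PATH=/path/to/llama3_tokenizer.model
bash examples/training/llama/h100/llama3_405b_bf16.sh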
22 changes: 22 additions & 0 deletions examples/training/llama/h100/llama3_405b_fp8.sh
@@ -0,0 +1,22 @@
#!/bin/bash

# Users should set up their cluster type in /launcher_scripts/conf/config.yaml
NEMO_FRAMEWORK_LAUNCHER_DIR=${NEMO_FRAMEWORK_LAUNCHER_DIR:-"/opt/NeMo-Framework-Launcher"}
DATA_DIR=${DATA_DIR}
TOK_PATH=${TOK_PATH}

HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
training=llama/llama3_405b \
stages=[training] \
data_dir=${DATA_DIR} \
launcher_scripts_path=${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts \
base_results_dir=${NEMO_FRAMEWORK_LAUNCHER_DIR}/results \
training.run.name="llama3_1_405b_fp8" \
training.run.time_limit=0:30:00 \
training.trainer.num_nodes=72 \
training.model.global_batch_size=252 \
training.model.fp8=True \
training.model.fp8_hybrid=True \
training.model.tokenizer.model=${TOK_PATH} \
training.model.optim.grad_sync_dtype=bf16 \
training/tp_overlap@training.model.ub_tp_comm_overlap_cfg=ub_cfg_h100_fp8_h16384_tp8_cp2_mbs1_seqlen8192 \
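The Hydra overrides in these scripts can be extended on the same command line. As a rough sketch, with illustrative values that are not tuned settings from this commit, one might scale the run by appending further overrides:

# Illustrative only: extra overrides appended to the launcher command above
    training.trainer.num_nodes=144 \
    training.model.global_batch_size=504 \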
2 changes: 2 additions & 0 deletions launcher_scripts/conf/training/llama/llama2_13b.yaml
@@ -124,6 +124,8 @@ model:
num_micro_batches_with_partial_activation_checkpoints: 0
activations_checkpoint_layers_per_pipeline: 0
sequence_parallel: false
+deterministic_mode: false
+cross_entropy_loss_fusion: true
transformer_engine: true
fp8: false
fp8_e4m3: false
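The new flags added here (and to several of the other Llama configs below) can also be flipped for a single run from the launcher command line instead of editing the YAML; a sketch in the override style of the scripts above:

# Sketch only: per-run toggles for the new model flags (values illustrative)
    training.model.deterministic_mode=True \
    training.model.cross_entropy_loss_fusion=False \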
6 changes: 5 additions & 1 deletion launcher_scripts/conf/training/llama/llama2_70b.yaml
@@ -58,7 +58,7 @@ model:
rampup_batch_size: null
tensor_model_parallel_size: 4
pipeline_model_parallel_size: 4
-virtual_pipeline_model_parallel_size: 20
+virtual_pipeline_model_parallel_size: 5
encoder_seq_length: 4096
max_position_embeddings: 4096
num_layers: 80
@@ -124,6 +124,10 @@ model:
num_micro_batches_with_partial_activation_checkpoints: 0
activations_checkpoint_layers_per_pipeline: 0
sequence_parallel: true
+defer_embedding_wgrad_compute: true
+wgrad_deferral_limit: 22
+cross_entropy_loss_fusion: true
+deterministic_mode: false
transformer_engine: true
fp8: false
fp8_e4m3: false
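To experiment with these llama2_70b values without editing the YAML, the same keys could presumably be overridden through the launcher as sketched below (the data, tokenizer, and results arguments shown in the scripts above are omitted for brevity):

# Sketch only: command-line counterparts of the llama2_70b.yaml changes
HYDRA_FULL_ERROR=1 python3 ${NEMO_FRAMEWORK_LAUNCHER_DIR}/launcher_scripts/main.py \
    training=llama/llama2_70b \
    stages=[training] \
    training.model.virtual_pipeline_model_parallel_size=5 \
    training.model.defer_embedding_wgrad_compute=True \
    training.model.wgrad_deferral_limit=22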
2 changes: 2 additions & 0 deletions launcher_scripts/conf/training/llama/llama2_7b.yaml
@@ -124,6 +124,8 @@ model:
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: false
+deterministic_mode: false
+cross_entropy_loss_fusion: true

## Transformer Engine
transformer_engine: true
5 changes: 4 additions & 1 deletion launcher_scripts/conf/training/llama/llama3_1_405b.yaml
@@ -1,6 +1,6 @@
defaults:
  - _self_
-  - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_h8192_tp4_mbs1_seqlen8192
+  - optional tp_overlap@model.ub_tp_comm_overlap_cfg: ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192

hydra:
searchpath:
@@ -124,6 +124,9 @@ model:
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: true
+defer_embedding_wgrad_compute: true
+wgrad_deferral_limit: 50
+deterministic_mode: false

## Transformer Engine
transformer_engine: true
3 changes: 3 additions & 0 deletions launcher_scripts/conf/training/llama/llama3_1_70b.yaml
@@ -124,6 +124,9 @@ model:
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: true
+defer_embedding_wgrad_compute: true
+wgrad_deferral_limit: 22
+deterministic_mode: false

## Transformer Engine
transformer_engine: true
2 changes: 2 additions & 0 deletions launcher_scripts/conf/training/llama/llama3_1_8b.yaml
@@ -124,6 +124,8 @@ model:
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: false
+sequence_parallel: false
+deterministic_mode: false

## Transformer Engine
transformer_engine: true
5 changes: 4 additions & 1 deletion launcher_scripts/conf/training/llama/llama3_70b.yaml
@@ -59,7 +59,7 @@ model:
rampup_batch_size: null
tensor_model_parallel_size: 4
pipeline_model_parallel_size: 4
-virtual_pipeline_model_parallel_size: 10
+virtual_pipeline_model_parallel_size: 5
context_parallel_size: 2
encoder_seq_length: 8192
max_position_embeddings: 8192
@@ -123,6 +123,9 @@ model:
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: true
+defer_embedding_wgrad_compute: true
+wgrad_deferral_limit: 22
+deterministic_mode: false

## Transformer Engine
transformer_engine: true
1 change: 1 addition & 0 deletions launcher_scripts/conf/training/llama/llama3_8b.yaml
@@ -123,6 +123,7 @@ model:
num_micro_batches_with_partial_activation_checkpoints: null
activations_checkpoint_layers_per_pipeline: null
sequence_parallel: false
+deterministic_mode: false

## Transformer Engine
transformer_engine: true
63 changes: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# UB communicator configurations
# Model configs: H100/405B/TP8/CP2/MBS1/SeqLen8K/BF16

# Bulk overlap with AllGather / ReduceScatter
qkv_dgrad:
  method: bulk
  num_sm: 2
  cga_size: 2
  set_sm_margin: 0

qkv_wgrad:
  method: bulk
  num_sm: 24
  cga_size: 2
  set_sm_margin: 0

fc1_dgrad:
  method: bulk
  num_sm: 2
  cga_size: 2
  set_sm_margin: 0

fc1_wgrad:
  method: bulk
  num_sm: 2
  cga_size: 2
  set_sm_margin: 0

## Ring
qkv_fprop:
  method: ring_exchange
  aggregate: 1

proj_dgrad:
  method: ring_exchange
  aggregate: 1

fc1_fprop:
  method: ring_exchange
  aggregate: 1

fc2_dgrad:
  method: ring_exchange
  aggregate: 1

# Chunked
proj_fprop:
  method: pipeline
  num_sm: 24
  cga_size: 2
  num_splits: 4
  set_sm_margin: 1
  fp8_buf: 0
  atomic_gemm: 0

fc2_fprop:
  method: pipeline
  num_sm: 8
  cga_size: 2
  num_splits: 4
  set_sm_margin: 1
  fp8_buf: 0
  atomic_gemm: 0
63 changes: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
# UB communicator configurations
# Model configs: H100/405B/TP8/CP2/MBS1/SeqLen8K/FP8

# Bulk overlap with AllGather / ReduceScatter
qkv_dgrad:
  method: bulk
  num_sm: 2
  cga_size: 2
  set_sm_margin: 0

qkv_wgrad:
  method: bulk
  num_sm: 24
  cga_size: 2
  set_sm_margin: 0

fc1_dgrad:
  method: bulk
  num_sm: 2
  cga_size: 2
  set_sm_margin: 0

fc1_wgrad:
  method: bulk
  num_sm: 2
  cga_size: 2
  set_sm_margin: 0

## Ring
qkv_fprop:
  method: ring_exchange
  aggregate: 1

proj_dgrad:
  method: ring_exchange
  aggregate: 1

fc1_fprop:
  method: ring_exchange
  aggregate: 1

fc2_dgrad:
  method: ring_exchange
  aggregate: 0

# Chunked
proj_fprop:
  method: pipeline
  num_sm: 24
  cga_size: 2
  num_splits: 4
  set_sm_margin: 1
  fp8_buf: 1
  atomic_gemm: 0

fc2_fprop:
  method: pipeline
  num_sm: 16
  cga_size: 2
  num_splits: 4
  set_sm_margin: 1
  fp8_buf: 1
  atomic_gemm: 0
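These two new communicator configs are pulled in through the training/tp_overlap config group: the bf16 variant is wired in as the optional default of llama3_1_405b.yaml above, while the FP8 variant is selected explicitly by llama3_405b_fp8.sh. As a sketch, either one can be chosen by name at launch time with an override of the same form as that script:

# Sketch only: pick a UB overlap config by name (names taken from this commit)
    training/tp_overlap@training.model.ub_tp_comm_overlap_cfg=ub_cfg_h100_bf16_h16384_tp8_cp2_mbs1_seqlen8192 \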
