add mistral-nemo-123b model train config
akoumpa committed Aug 2, 2024
1 parent 9906965 commit f5b7242
Showing 1 changed file with 231 additions and 0 deletions.
launcher_scripts/conf/training/mistral/mistral_nemo_123b.yaml
@@ -0,0 +1,231 @@
run:
  name: mistral_nemo_123b
  results_dir: ${base_results_dir}/${.name}
  time_limit: "0-04:00:00"
  dependency: "singleton"
trainer:
  num_nodes: 16
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: False # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False
  max_epochs: null
  max_steps: 300000 # consumed_samples = global_step * global_batch_size
  max_time: "05:23:30:00" # days:hours:minutes:seconds
  log_every_n_steps: 10
  val_check_interval: 2000
  limit_val_batches: 32
  limit_test_batches: 50
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
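
As the inline comment on max_steps notes, consumed samples scale with the global batch size. A quick Python sanity check of the token budget these settings imply (the batch size and sequence length come from the model section below):

# Token budget implied by the trainer settings above.
global_batch_size = 32       # model.global_batch_size
seq_length = 4096            # model.encoder_seq_length
max_steps = 300_000          # trainer.max_steps

consumed_samples = max_steps * global_batch_size   # 9,600,000 samples
total_tokens = consumed_samples * seq_length       # ~39.3B tokens
print(f"{consumed_samples:,} samples, {total_tokens / 1e9:.1f}B tokens")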
exp_manager:
  explicit_log_dir: ${training.run.results_dir}/results
  exp_dir: null
  name: megatron_mistral
  create_wandb_logger: False
  wandb_logger_kwargs:
    project: nemo_mistral_pretrain
    name: ${training.run.name}
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: True
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
    save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
    filename: 'megatron_mistral--{val_loss:.2f}-{step}-{consumed_samples}'
    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}}
  log_step_timing: True
  step_timing_kwargs:
    sync_cuda: True
    buffer_size: 5
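
The ${multiply:...} resolver above derives the model-parallel size from the tensor- and pipeline-parallel settings in the model section; combined with the trainer topology, that fixes the data-parallel size and the gradient-accumulation factor Megatron computes from the global batch size. A sketch of the arithmetic, using values from this config:

# Parallelism arithmetic implied by this config.
num_nodes, devices = 16, 8   # trainer topology: 128 GPUs total
tp, pp = 8, 1                # tensor/pipeline model parallel sizes

world_size = num_nodes * devices                         # 128
model_parallel_size = tp * pp                            # 8, as resolved above
data_parallel_size = world_size // model_parallel_size   # 16 replicas

micro_batch, global_batch = 1, 32
grad_acc = global_batch // (micro_batch * data_parallel_size)  # 2 micro-batches per replica per step
print(model_parallel_size, data_parallel_size, grad_acc)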

model:
  mcore_gpt: true
  micro_batch_size: 1
  global_batch_size: 32
  rampup_batch_size: null
  tensor_model_parallel_size: 8
  pipeline_model_parallel_size: 1
  virtual_pipeline_model_parallel_size: null
  encoder_seq_length: 4096
  max_position_embeddings: 32768
  num_layers: 88
  hidden_size: 12288
  ffn_hidden_size: 28672
  num_attention_heads: 96
  init_method_std: 0.02
  use_scaled_init_method: true
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0.0
  kv_channels: 128
  apply_query_key_layer_scaling: false
  normalization: rmsnorm
  layernorm_epsilon: 1.0e-05
  do_layer_norm_weight_decay: false
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  persist_layer_norm: true
  bias: false
  activation: fast-swiglu
  headscale: false
  transformer_block_type: pre_ln
  openai_gelu: false
  normalize_attention_scores: true
  position_embedding_type: rope
  rotary_percentage: 1.0
  attention_type: multihead
  share_embeddings_and_output_weights: false
  overlap_p2p_comm: false
  batch_p2p_comm: true
  seq_len_interpolation_factor: null
  num_query_groups: 8
  tokenizer:
    library: huggingface
    type: mistralai/Mistral-Large-Instruct-2407
    use_fast: true
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  hysteresis: 2
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: True
  grad_allreduce_chunk_size_mb: 125
  grad_div_ar_fusion: true
  gradient_accumulation_fusion: false
  bias_activation_fusion: false
  bias_dropout_add_fusion: false
  masked_softmax_fusion: true
  get_attention_mask_from_fusion: true
  seed: 1234
  resume_from_checkpoint: null
  use_cpu_initialization: false
  onnx_safe: false
  apex_transformer_log_level: 30
  gradient_as_bucket_view: true
  sync_batch_comm: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: true
  transformer_engine: true
  fp8: false
  fp8_e4m3: false
  fp8_hybrid: true
  fp8_margin: 0
  fp8_interval: 1
  fp8_amax_history_len: 1024
  fp8_amax_compute_algo: max
  reduce_amax: true
  use_emha: false
  ub_tp_comm_overlap: false
  ub_tp_comm_overlap_cfg: null
  use_flash_attention: false
  nsys_profile:
    enabled: false
    start_step: 10
    end_step: 10
    ranks:
    - 0
    gen_shape: false
  optim:
    name: distributed_fused_adam
    lr: 0.0001
    weight_decay: 0.1
    betas:
    - 0.9
    - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 107
      constant_steps: 11873
      min_lr: 1.0e-05
  window_size:
  - 4096
  - 0
  gc_interval: 0
  precision: bf16
  mcore_customization_config:
    new_decoder_architecture: false
    parallel_attention: true
  data:
    data_impl: mmap
    splits_string: "99990,8,2"
    seq_length: ${training.model.encoder_seq_length}
    skip_warmup: true
    num_workers: 2
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    index_mapping_dir: null
    data_prefix:
    - .0333
    - ${data_dir}/my-mistral_00_text_document
    - .0333
    - ${data_dir}/my-mistral_01_text_document
    - .0333
    - ${data_dir}/my-mistral_02_text_document
    - .0333
    - ${data_dir}/my-mistral_03_text_document
    - .0333
    - ${data_dir}/my-mistral_04_text_document
    - .0333
    - ${data_dir}/my-mistral_05_text_document
    - .0333
    - ${data_dir}/my-mistral_06_text_document
    - .0333
    - ${data_dir}/my-mistral_07_text_document
    - .0333
    - ${data_dir}/my-mistral_08_text_document
    - .0333
    - ${data_dir}/my-mistral_09_text_document
    - .0333
    - ${data_dir}/my-mistral_10_text_document
    - .0333
    - ${data_dir}/my-mistral_11_text_document
    - .0333
    - ${data_dir}/my-mistral_12_text_document
    - .0333
    - ${data_dir}/my-mistral_13_text_document
    - .0333
    - ${data_dir}/my-mistral_14_text_document
    - .0333
    - ${data_dir}/my-mistral_15_text_document
    - .0333
    - ${data_dir}/my-mistral_16_text_document
    - .0333
    - ${data_dir}/my-mistral_17_text_document
    - .0333
    - ${data_dir}/my-mistral_18_text_document
    - .0333
    - ${data_dir}/my-mistral_19_text_document
    - .0333
    - ${data_dir}/my-mistral_20_text_document
    - .0333
    - ${data_dir}/my-mistral_21_text_document
    - .0333
    - ${data_dir}/my-mistral_22_text_document
    - .0333
    - ${data_dir}/my-mistral_23_text_document
    - .0333
    - ${data_dir}/my-mistral_24_text_document
    - .0333
    - ${data_dir}/my-mistral_25_text_document
    - .0333
    - ${data_dir}/my-mistral_26_text_document
    - .0333
    - ${data_dir}/my-mistral_27_text_document
    - .0333
    - ${data_dir}/my-mistral_28_text_document
    - .0334
    - ${data_dir}/my-mistral_29_text_document
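
The data_prefix list alternates a blend weight with a dataset path: 29 shards weighted at .0333 and the last at .0334. The weights sum to 0.9991 rather than exactly 1.0, which is harmless since Megatron-style blending normalizes the weights internally. A short sketch that builds the same list programmatically (the shard naming follows the entries above; the concrete data_dir is an assumption standing in for ${data_dir}):

# Build the alternating weight/path blend list used by data_prefix above.
data_dir = "/path/to/data"   # assumption: stands in for ${data_dir}
num_shards = 30

data_prefix = []
for i in range(num_shards):
    weight = 0.0334 if i == num_shards - 1 else 0.0333
    data_prefix += [weight, f"{data_dir}/my-mistral_{i:02d}_text_document"]

print(round(sum(data_prefix[::2]), 4))   # 0.9991; renormalized at load time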

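Stepping back to the model section: 88 layers, hidden size 12288, a SwiGLU FFN of 28672, and 96 query heads over 8 KV groups with untied embeddings pencil out to roughly 123B parameters, matching the model's name. A back-of-envelope count, assuming a 32768-entry vocabulary for the referenced Mistral-Large tokenizer:

# Rough parameter count for the GQA + SwiGLU architecture above.
h, ffn, layers = 12288, 28672, 88
heads, kv_groups, head_dim = 96, 8, 128
vocab = 32_768   # assumption: vocab size of mistralai/Mistral-Large-Instruct-2407

attn = h * heads * head_dim            # Q projection
attn += 2 * h * kv_groups * head_dim   # K and V (grouped-query attention)
attn += heads * head_dim * h           # output projection
mlp = 3 * h * ffn                      # SwiGLU: gate, up, and down projections

per_layer = attn + mlp                 # norm weights are negligible
embeddings = 2 * vocab * h             # untied input and output embeddings
total = layers * per_layer + embeddings
print(f"~{total / 1e9:.1f}B parameters")   # ~122.6B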