diff --git a/launcher_scripts/conf/training/mistral/mistral_nemo_123b.yaml b/launcher_scripts/conf/training/mistral/mistral_nemo_123b.yaml
new file mode 100644
index 000000000..c7e6e5aaf
--- /dev/null
+++ b/launcher_scripts/conf/training/mistral/mistral_nemo_123b.yaml
@@ -0,0 +1,231 @@
+run:
+  name: mistral_nemo_123b
+  results_dir: ${base_results_dir}/${.name}
+  time_limit: "0-04:00:00"
+  dependency: "singleton"
+trainer:
+  num_nodes: 16
+  devices: 8
+  accelerator: gpu
+  precision: bf16
+  logger: False # logger provided by exp_manager
+  enable_checkpointing: False
+  use_distributed_sampler: False
+  max_epochs: null
+  max_steps: 300000 # consumed_samples = global_step * global_batch_size
+  max_time: "05:23:30:00" # days:hours:minutes:seconds
+  log_every_n_steps: 10
+  val_check_interval: 2000
+  limit_val_batches: 32
+  limit_test_batches: 50
+  accumulate_grad_batches: 1
+  gradient_clip_val: 1.0
+exp_manager:
+  explicit_log_dir: ${training.run.results_dir}/results
+  exp_dir: null
+  name: megatron_mistral
+  create_wandb_logger: False
+  wandb_logger_kwargs:
+    project: nemo_mistral_pretrain
+    name: ${training.run.name}
+  resume_if_exists: false
+  resume_ignore_no_checkpoint: true
+  create_checkpoint_callback: True
+  checkpoint_callback_params:
+    monitor: val_loss
+    save_top_k: 10
+    mode: min
+    always_save_nemo: False # saves nemo file during validation, not implemented for model parallel
+    save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits
+    filename: 'megatron_mistral--{val_loss:.2f}-{step}-{consumed_samples}'
+    model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}}
+  log_step_timing: True
+  step_timing_kwargs:
+    sync_cuda: True
+    buffer_size: 5
+
+model:
+  mcore_gpt: true
+  micro_batch_size: 1
+  global_batch_size: 32
+  rampup_batch_size: null
+  tensor_model_parallel_size: 8
+  pipeline_model_parallel_size: 1
+  virtual_pipeline_model_parallel_size: null
+  encoder_seq_length: 4096
+  max_position_embeddings: 32768
+  num_layers: 88
+  hidden_size: 12288
+  ffn_hidden_size: 28672
+  num_attention_heads: 96
+  init_method_std: 0.02
+  use_scaled_init_method: true
+  hidden_dropout: 0.0
+  attention_dropout: 0.0
+  ffn_dropout: 0
+  kv_channels: 128
+  apply_query_key_layer_scaling: false
+  normalization: rmsnorm
+  layernorm_epsilon: 1.0e-05
+  do_layer_norm_weight_decay: false
+  make_vocab_size_divisible_by: 128
+  pre_process: true
+  post_process: true
+  persist_layer_norm: true
+  bias: false
+  activation: fast-swiglu
+  headscale: false
+  transformer_block_type: pre_ln
+  openai_gelu: false
+  normalize_attention_scores: true
+  position_embedding_type: rope
+  rotary_percentage: 1.0
+  attention_type: multihead
+  share_embeddings_and_output_weights: false
+  overlap_p2p_comm: false
+  batch_p2p_comm: true
+  seq_len_interpolation_factor: null
+  num_query_groups: 8
+  tokenizer:
+    library: huggingface
+    type: mistralai/Mistral-Large-Instruct-2407
+    use_fast: true
+  native_amp_init_scale: 4294967296
+  native_amp_growth_interval: 1000
+  hysteresis: 2
+  fp32_residual_connection: false
+  fp16_lm_cross_entropy: false
+  megatron_amp_O2: True
+  grad_allreduce_chunk_size_mb: 125
+  grad_div_ar_fusion: true
+  gradient_accumulation_fusion: false
+  bias_activation_fusion: false
+  bias_dropout_add_fusion: false
+  masked_softmax_fusion: true
+  get_attention_mask_from_fusion: true
+  seed: 1234
+  resume_from_checkpoint: null
+  use_cpu_initialization: false
+  onnx_safe: false
+  apex_transformer_log_level: 30
+  gradient_as_bucket_view: true
+  sync_batch_comm: false
+  activations_checkpoint_granularity: null
+  activations_checkpoint_method: null
+  activations_checkpoint_num_layers: null
+  num_micro_batches_with_partial_activation_checkpoints: null
+  activations_checkpoint_layers_per_pipeline: null
+  sequence_parallel: true
+  transformer_engine: true
+  fp8: false
+  fp8_e4m3: false
+  fp8_hybrid: true
+  fp8_margin: 0
+  fp8_interval: 1
+  fp8_amax_history_len: 1024
+  fp8_amax_compute_algo: max
+  reduce_amax: true
+  use_emha: false
+  ub_tp_comm_overlap: false
+  ub_tp_comm_overlap_cfg: null
+  use_flash_attention: false
+  nsys_profile:
+    enabled: false
+    start_step: 10
+    end_step: 10
+    ranks:
+    - 0
+    gen_shape: false
+  optim:
+    name: distributed_fused_adam
+    lr: 0.0001
+    weight_decay: 0.1
+    betas:
+    - 0.9
+    - 0.95
+    sched:
+      name: CosineAnnealing
+      warmup_steps: 107
+      constant_steps: 11873
+      min_lr: 1.0e-05
+  window_size:
+  - 4096
+  - 0
+  gc_interval: 0
+  precision: bf16
+  mcore_customization_config:
+    new_decoder_architecture: false
+    parallel_attention: true
+  data:
+    data_impl: mmap
+    splits_string: "99990,8,2"
+    seq_length: 2048
+    skip_warmup: true
+    num_workers: 2
+    dataloader_type: single
+    reset_position_ids: false
+    reset_attention_mask: false
+    eod_mask_loss: false
+    index_mapping_dir: null
+    data_prefix:
+    - .0333
+    - ${data_dir}/my-mistral_00_text_document
+    - .0333
+    - ${data_dir}/my-mistral_01_text_document
+    - .0333
+    - ${data_dir}/my-mistral_02_text_document
+    - .0333
+    - ${data_dir}/my-mistral_03_text_document
+    - .0333
+    - ${data_dir}/my-mistral_04_text_document
+    - .0333
+    - ${data_dir}/my-mistral_05_text_document
+    - .0333
+    - ${data_dir}/my-mistral_06_text_document
+    - .0333
+    - ${data_dir}/my-mistral_07_text_document
+    - .0333
+    - ${data_dir}/my-mistral_08_text_document
+    - .0333
+    - ${data_dir}/my-mistral_09_text_document
+    - .0333
+    - ${data_dir}/my-mistral_10_text_document
+    - .0333
+    - ${data_dir}/my-mistral_11_text_document
+    - .0333
+    - ${data_dir}/my-mistral_12_text_document
+    - .0333
+    - ${data_dir}/my-mistral_13_text_document
+    - .0333
+    - ${data_dir}/my-mistral_14_text_document
+    - .0333
+    - ${data_dir}/my-mistral_15_text_document
+    - .0333
+    - ${data_dir}/my-mistral_16_text_document
+    - .0333
+    - ${data_dir}/my-mistral_17_text_document
+    - .0333
+    - ${data_dir}/my-mistral_18_text_document
+    - .0333
+    - ${data_dir}/my-mistral_19_text_document
+    - .0333
+    - ${data_dir}/my-mistral_20_text_document
+    - .0333
+    - ${data_dir}/my-mistral_21_text_document
+    - .0333
+    - ${data_dir}/my-mistral_22_text_document
+    - .0333
+    - ${data_dir}/my-mistral_23_text_document
+    - .0333
+    - ${data_dir}/my-mistral_24_text_document
+    - .0333
+    - ${data_dir}/my-mistral_25_text_document
+    - .0333
+    - ${data_dir}/my-mistral_26_text_document
+    - .0333
+    - ${data_dir}/my-mistral_27_text_document
+    - .0333
+    - ${data_dir}/my-mistral_28_text_document
+    - .0334
+    - ${data_dir}/my-mistral_29_text_document
+
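
A quick sanity check on the arithmetic this config implies (a standalone sketch, not part of the launcher; all numbers are copied from the `trainer` and `model` sections above):

```python
# Parallelism and batch math implied by mistral_nemo_123b.yaml.
num_nodes, devices = 16, 8   # trainer.num_nodes, trainer.devices
tp, pp = 8, 1                # tensor_/pipeline_model_parallel_size

world_size = num_nodes * devices                         # 128 GPUs
model_parallel_size = tp * pp                            # 8, what the ${multiply:...} resolver yields
data_parallel_size = world_size // model_parallel_size   # 16 replicas

micro_bs, global_bs = 1, 32  # model.micro_batch_size, model.global_batch_size
# global_batch_size must be divisible by micro_batch_size * data_parallel_size;
# the quotient is the number of gradient-accumulation micro-steps per rank.
grad_acc_steps = global_bs // (micro_bs * data_parallel_size)  # 2

# Blend weights for the 30 data shards: 29 at .0333 plus one at .0334.
blend = [0.0333] * 29 + [0.0334]
print(sum(blend))  # ~0.9991; the blend is renormalized downstream, so an exact 1.0 is not required
```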