From c631a8a75ccec3208cb2d7761ca1ab7e131cc8fd Mon Sep 17 00:00:00 2001
From: Malay Nagda
Date: Tue, 9 Jul 2024 12:09:21 +0530
Subject: [PATCH 1/2] moe token drop params

---
 auto_configurator/base_configs/mixtral_3b.yaml | 3 +++
 auto_configurator/base_configs/mixtral_7b.yaml | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/auto_configurator/base_configs/mixtral_3b.yaml b/auto_configurator/base_configs/mixtral_3b.yaml
index e456a43dc..fba0b07c3 100644
--- a/auto_configurator/base_configs/mixtral_3b.yaml
+++ b/auto_configurator/base_configs/mixtral_3b.yaml
@@ -47,6 +47,9 @@ exp_manager:
 model:
   mcore_gpt: true
   moe_grouped_gemm: true
+  moe_token_dispatcher_type: alltoall
+  moe_pad_expert_input_to_capacity: True
+  moe_expert_capacity_factor: 1.0
   micro_batch_size: 1
   global_batch_size: 128
   rampup_batch_size: null
diff --git a/auto_configurator/base_configs/mixtral_7b.yaml b/auto_configurator/base_configs/mixtral_7b.yaml
index 661bc55cd..32adcb6bd 100644
--- a/auto_configurator/base_configs/mixtral_7b.yaml
+++ b/auto_configurator/base_configs/mixtral_7b.yaml
@@ -48,6 +48,8 @@ model:
   mcore_gpt: true
   moe_grouped_gemm: true
   moe_token_dispatcher_type: alltoall
+  moe_pad_expert_input_to_capacity: True
+  moe_expert_capacity_factor: 1.0
   moe_aux_loss_coeff: 0.01
   micro_batch_size: 1
   global_batch_size: 256

From 1c3d475f61193925034e0067d6f4eca8eb2cfc0a Mon Sep 17 00:00:00 2001
From: Malay Nagda
Date: Tue, 9 Jul 2024 12:13:37 +0530
Subject: [PATCH 2/2] moe token drop params

---
 auto_configurator/base_configs/mixtral_3b.yaml           | 3 ---
 auto_configurator/base_configs/mixtral_7b.yaml           | 2 --
 launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml | 3 +++
 launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml | 2 ++
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/auto_configurator/base_configs/mixtral_3b.yaml b/auto_configurator/base_configs/mixtral_3b.yaml
index fba0b07c3..e456a43dc 100644
--- a/auto_configurator/base_configs/mixtral_3b.yaml
+++ b/auto_configurator/base_configs/mixtral_3b.yaml
@@ -47,9 +47,6 @@ exp_manager:
 model:
   mcore_gpt: true
   moe_grouped_gemm: true
-  moe_token_dispatcher_type: alltoall
-  moe_pad_expert_input_to_capacity: True
-  moe_expert_capacity_factor: 1.0
   micro_batch_size: 1
   global_batch_size: 128
   rampup_batch_size: null
diff --git a/auto_configurator/base_configs/mixtral_7b.yaml b/auto_configurator/base_configs/mixtral_7b.yaml
index 32adcb6bd..661bc55cd 100644
--- a/auto_configurator/base_configs/mixtral_7b.yaml
+++ b/auto_configurator/base_configs/mixtral_7b.yaml
@@ -48,8 +48,6 @@ model:
   mcore_gpt: true
   moe_grouped_gemm: true
   moe_token_dispatcher_type: alltoall
-  moe_pad_expert_input_to_capacity: True
-  moe_expert_capacity_factor: 1.0
   moe_aux_loss_coeff: 0.01
   micro_batch_size: 1
   global_batch_size: 256
diff --git a/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml b/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
index 364f756e8..c997bed57 100644
--- a/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
+++ b/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
@@ -47,6 +47,9 @@ exp_manager:
 model:
   mcore_gpt: true
   moe_grouped_gemm: true
+  moe_token_dispatcher_type: alltoall
+  moe_pad_expert_input_to_capacity: True
+  moe_expert_capacity_factor: 1.0
   micro_batch_size: 1
   global_batch_size: 128
   rampup_batch_size: null
diff --git a/launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml b/launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
index 05daef455..7f31773e1 100644
--- a/launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
+++ b/launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
@@ -48,6 +48,8 @@ model:
   mcore_gpt: true
   moe_grouped_gemm: true
   moe_token_dispatcher_type: alltoall
+  moe_pad_expert_input_to_capacity: True
+  moe_expert_capacity_factor: 1.0
   moe_aux_loss_coeff: 0.01
   micro_batch_size: 1
   global_batch_size: 256
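
Note (not part of the patches above): the net effect of the series is that the MoE token-drop keys end up only in the launcher training configs under launcher_scripts/conf/training/mixtral/, since PATCH 2/2 removes them again from the auto_configurator base configs. Below is a minimal sketch of how the resulting block under model: reads in mixtral_8x3b.yaml after both patches apply; the inline comments describe one reading of these Megatron-Core capacity/token-drop flags and are assumptions, not text from the diffs.

model:
  mcore_gpt: true
  moe_grouped_gemm: true
  # Capacity-based token dropping: each expert processes at most
  # moe_expert_capacity_factor * (tokens in batch / num experts) tokens.
  moe_token_dispatcher_type: alltoall        # capacity-factor dropping is used with the alltoall dispatcher
  moe_pad_expert_input_to_capacity: True     # pad every expert's input up to the capacity, giving static shapes
  moe_expert_capacity_factor: 1.0            # 1.0 = capacity equals the mean tokens per expert; overflow tokens are dropped
  micro_batch_size: 1
  global_batch_size: 128

The mixtral_8x7b.yaml config gains the same two new keys next to its existing moe_token_dispatcher_type: alltoall and keeps moe_aux_loss_coeff: 0.01, as shown in the final hunk of PATCH 2/2.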