From c631a8a75ccec3208cb2d7761ca1ab7e131cc8fd Mon Sep 17 00:00:00 2001
From: Malay Nagda
Date: Tue, 9 Jul 2024 12:09:21 +0530
Subject: [PATCH 1/2] moe token drop params

---
 auto_configurator/base_configs/mixtral_3b.yaml | 3 +++
 auto_configurator/base_configs/mixtral_7b.yaml | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/auto_configurator/base_configs/mixtral_3b.yaml b/auto_configurator/base_configs/mixtral_3b.yaml
index e456a43dc..fba0b07c3 100644
--- a/auto_configurator/base_configs/mixtral_3b.yaml
+++ b/auto_configurator/base_configs/mixtral_3b.yaml
@@ -47,6 +47,9 @@ exp_manager:
 model:
   mcore_gpt: true
   moe_grouped_gemm: true
+  moe_token_dispatcher_type: alltoall
+  moe_pad_expert_input_to_capacity: True
+  moe_expert_capacity_factor: 1.0
   micro_batch_size: 1
   global_batch_size: 128
   rampup_batch_size: null
diff --git a/auto_configurator/base_configs/mixtral_7b.yaml b/auto_configurator/base_configs/mixtral_7b.yaml
index 661bc55cd..32adcb6bd 100644
--- a/auto_configurator/base_configs/mixtral_7b.yaml
+++ b/auto_configurator/base_configs/mixtral_7b.yaml
@@ -48,6 +48,8 @@ model:
   mcore_gpt: true
   moe_grouped_gemm: true
   moe_token_dispatcher_type: alltoall
+  moe_pad_expert_input_to_capacity: True
+  moe_expert_capacity_factor: 1.0
   moe_aux_loss_coeff: 0.01
   micro_batch_size: 1
   global_batch_size: 256

From 1c3d475f61193925034e0067d6f4eca8eb2cfc0a Mon Sep 17 00:00:00 2001
From: Malay Nagda
Date: Tue, 9 Jul 2024 12:13:37 +0530
Subject: [PATCH 2/2] moe token drop params

---
 auto_configurator/base_configs/mixtral_3b.yaml           | 3 ---
 auto_configurator/base_configs/mixtral_7b.yaml           | 2 --
 launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml | 3 +++
 launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml | 2 ++
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/auto_configurator/base_configs/mixtral_3b.yaml b/auto_configurator/base_configs/mixtral_3b.yaml
index fba0b07c3..e456a43dc 100644
--- a/auto_configurator/base_configs/mixtral_3b.yaml
+++ b/auto_configurator/base_configs/mixtral_3b.yaml
@@ -47,9 +47,6 @@ exp_manager:
 model:
   mcore_gpt: true
   moe_grouped_gemm: true
-  moe_token_dispatcher_type: alltoall
-  moe_pad_expert_input_to_capacity: True
-  moe_expert_capacity_factor: 1.0
   micro_batch_size: 1
   global_batch_size: 128
   rampup_batch_size: null
diff --git a/auto_configurator/base_configs/mixtral_7b.yaml b/auto_configurator/base_configs/mixtral_7b.yaml
index 32adcb6bd..661bc55cd 100644
--- a/auto_configurator/base_configs/mixtral_7b.yaml
+++ b/auto_configurator/base_configs/mixtral_7b.yaml
@@ -48,8 +48,6 @@ model:
   mcore_gpt: true
   moe_grouped_gemm: true
   moe_token_dispatcher_type: alltoall
-  moe_pad_expert_input_to_capacity: True
-  moe_expert_capacity_factor: 1.0
   moe_aux_loss_coeff: 0.01
   micro_batch_size: 1
   global_batch_size: 256
diff --git a/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml b/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
index 364f756e8..c997bed57 100644
--- a/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
+++ b/launcher_scripts/conf/training/mixtral/mixtral_8x3b.yaml
@@ -47,6 +47,9 @@ exp_manager:
 model:
   mcore_gpt: true
   moe_grouped_gemm: true
+  moe_token_dispatcher_type: alltoall
+  moe_pad_expert_input_to_capacity: True
+  moe_expert_capacity_factor: 1.0
   micro_batch_size: 1
   global_batch_size: 128
   rampup_batch_size: null
diff --git a/launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml b/launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
index 05daef455..7f31773e1 100644
--- a/launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
+++ b/launcher_scripts/conf/training/mixtral/mixtral_8x7b.yaml
@@ -48,6 +48,8 @@ model:
   mcore_gpt: true
   moe_grouped_gemm: true
   moe_token_dispatcher_type: alltoall
+  moe_pad_expert_input_to_capacity: True
+  moe_expert_capacity_factor: 1.0
   moe_aux_loss_coeff: 0.01
   micro_batch_size: 1
   global_batch_size: 256
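
Note (not part of the patches above): the net effect of the series is that the MoE token-drop keys end up only in the launcher training configs under launcher_scripts/conf/training/mixtral/, since PATCH 2/2 removes them again from the auto_configurator base configs. Below is a minimal sketch of how the resulting block under model: reads in mixtral_8x3b.yaml after both patches apply; the inline comments describe one reading of these Megatron-Core capacity/token-drop flags and are assumptions, not text from the diffs.

model:
  mcore_gpt: true
  moe_grouped_gemm: true
  # Capacity-based token dropping: each expert processes at most
  # moe_expert_capacity_factor * (tokens in batch / num experts) tokens.
  moe_token_dispatcher_type: alltoall        # capacity-factor dropping is used with the alltoall dispatcher
  moe_pad_expert_input_to_capacity: True     # pad every expert's input up to the capacity, giving static shapes
  moe_expert_capacity_factor: 1.0            # 1.0 = capacity equals the mean tokens per expert; overflow tokens are dropped
  micro_batch_size: 1
  global_batch_size: 128

The mixtral_8x7b.yaml config gains the same two new keys next to its existing moe_token_dispatcher_type: alltoall and keeps moe_aux_loss_coeff: 0.01, as shown in the final hunk of PATCH 2/2.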