update t5/mt5 configs #372

Merged (1 commit, Jul 9, 2024)
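Every file touched by this PR receives the same four added lines at the top of its model section: mcore_t5: True opts into the T5/MT5 implementation from megatron.core, and transformer_engine: False keeps Transformer Engine disabled for that path. For context, a minimal sketch of the resulting block (the explanatory comments are my reading of the key names, not text from the PR):

model:
  # use T5 model from megatron.core
  mcore_t5: True             # True: build the encoder-decoder stack from megatron.core; False: legacy NeMo T5 path
  transformer_engine: False  # keep Transformer Engine layers disabled alongside the mcore path
  # ...remaining per-task settings unchanged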
4 changes: 4 additions & 0 deletions launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml
@@ -45,6 +45,10 @@ exp_manager:
     save_best_model: True
 
 model: # For different fine_tuning tasks, tune the hyperparameters accordingly
+  # use MT5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_mt5.nemo # Path to a trained mt5 .nemo file
   pretrained_checkpoint:
     checkpoint_dir: null # Path to a folder that contains a .ckpt file

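The restore_from_path values in these fine-tuning configs rely on OmegaConf interpolation: ${fine_tuning.run.convert_dir} is resolved against the run section of the same composed config at load time. A tiny illustration of the mechanism, with a hypothetical convert_dir value (not from the PR):

fine_tuning:
  run:
    convert_dir: /results/convert_mt5   # hypothetical value, for illustration only
  model:
    restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_mt5.nemo
    # resolves to: /results/convert_mt5/results/megatron_mt5.nemo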
4 changes: 4 additions & 0 deletions launcher_scripts/conf/fine_tuning/mt5/xquad.yaml
@@ -45,6 +45,10 @@ exp_manager:
     save_best_model: True
 
 model: # For different fine_tuning tasks, tune the hyperparameters accordingly
+  # use MT5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_mt5.nemo # Path to a trained mt5 .nemo file
   pretrained_checkpoint:
     checkpoint_dir: null # Path to a folder that contains a .ckpt file

4 changes: 4 additions & 0 deletions launcher_scripts/conf/fine_tuning/t5/custom_task.yaml
@@ -45,6 +45,10 @@ exp_manager:
     save_best_model: True
 
 model: # For different fine_tuning tasks, tune the hyperparameters accordingly; the values below are only for MNLI
+  # use T5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_t5.nemo # Path to a trained T5 .nemo file
   pretrained_checkpoint:
     checkpoint_dir: null # Path to a folder that contains a .ckpt file

4 changes: 4 additions & 0 deletions launcher_scripts/conf/fine_tuning/t5/squad.yaml
@@ -45,6 +45,10 @@ exp_manager:
     save_best_model: True
 
 model: # For different fine_tuning tasks, tune the hyperparameters accordingly; the values below are only for MNLI
+  # use T5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_t5.nemo # Path to a trained T5 .nemo file
   pretrained_checkpoint:
     checkpoint_dir: null # Path to a folder that contains a .ckpt file

4 changes: 4 additions & 0 deletions launcher_scripts/conf/peft/t5/squad.yaml
@@ -54,6 +54,10 @@ exp_manager:
     strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training.
 
 model:
+  # use T5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   seed: 1234
   tensor_model_parallel_size: 1 # intra-layer model parallelism
   pipeline_model_parallel_size: 1 # inter-layer model parallelism

4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/mt5/11b.yaml
@@ -53,6 +53,10 @@ exp_manager:
     buffer_size: 5
 
 model:
+  # use MT5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   # model parallelism
   micro_batch_size: 24
   global_batch_size: 1920 # will use more micro batches to reach global batch size

4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/mt5/170m.yaml
@@ -53,6 +53,10 @@ exp_manager:
     buffer_size: 5
 
 model:
+  # use MT5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   # model parallelism
   micro_batch_size: 64
   global_batch_size: 2048 # will use more micro batches to reach global batch size

4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/mt5/23b.yaml
@@ -53,6 +53,10 @@ exp_manager:
     buffer_size: 5
 
 model:
+  # use MT5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   # model parallelism
   micro_batch_size: 8
   global_batch_size: 1920 # will use more micro batches to reach global batch size

4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/mt5/390m.yaml
@@ -53,6 +53,10 @@ exp_manager:
     buffer_size: 5
 
 model:
+  # use MT5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   # model parallelism
   micro_batch_size: 32
   global_batch_size: 2048 # will use more micro batches to reach global batch size

4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/mt5/3b.yaml
@@ -53,6 +53,10 @@ exp_manager:
     buffer_size: 5
 
 model:
+  # use MT5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   # model parallelism
   micro_batch_size: 24
   global_batch_size: 1920 # will use more micro batches to reach global batch size

4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/t5/11b.yaml
@@ -51,6 +51,10 @@ exp_manager:
     buffer_size: 5
 
 model:
+  # use T5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   # model parallelism
   micro_batch_size: 24
   global_batch_size: 1920 # will use more micro batches to reach global batch size

4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/t5/220m.yaml
@@ -51,6 +51,10 @@ exp_manager:
     buffer_size: 5
 
 model:
+  # use T5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   # model parallelism
   micro_batch_size: 64
   global_batch_size: 2048 # will use more micro batches to reach global batch size

4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/t5/23b.yaml
@@ -51,6 +51,10 @@ exp_manager:
     buffer_size: 5
 
 model:
+  # use T5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   # model parallelism
   micro_batch_size: 8
   global_batch_size: 1920 # will use more micro batches to reach global batch size

4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/t5/3b.yaml
@@ -51,6 +51,10 @@ exp_manager:
     buffer_size: 5
 
 model:
+  # use T5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   # model parallelism
   micro_batch_size: 24
   global_batch_size: 1920 # will use more micro batches to reach global batch size

4 changes: 4 additions & 0 deletions launcher_scripts/conf/training/t5/41b.yaml
@@ -51,6 +51,10 @@ exp_manager:
     buffer_size: 5
 
 model:
+  # use T5 model from megatron.core
+  mcore_t5: True
+  transformer_engine: False
+
   # model parallelism
   micro_batch_size: 6
   global_batch_size: 1920 # will use more micro batches to reach global batch size

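These YAML files are stage configs for the NeMo launcher, so the new flags can also be toggled per run rather than edited in place. A hedged sketch of how that fits together, assuming the launcher's usual Hydra setup (a defaults list in conf/config.yaml and a main.py entry point; the group names below are inferred from the file paths in this diff, not stated in the PR):

# conf/config.yaml (sketch, assumed structure)
defaults:
  - training: t5/220m        # resolves to launcher_scripts/conf/training/t5/220m.yaml
  - fine_tuning: t5/squad    # resolves to launcher_scripts/conf/fine_tuning/t5/squad.yaml

# A one-off run could then flip the new flags from the command line, e.g. (Hydra
# override syntax, assumed entry point):
#   python main.py training.model.mcore_t5=False training.model.transformer_engine=True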