diff --git a/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml b/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml index abd3c2565c..ea0138badb 100755 --- a/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml +++ b/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml @@ -45,6 +45,10 @@ exp_manager: save_best_model: True model: # For different fine_tuning tasks, tuning the hyper parameters accordingly + # use MT5 model from megatron.core + mcore_t5: True + transformer_engine: False + restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_mt5.nemo # Path to a trained mt5 .nemo file pretrained_checkpoint: checkpoint_dir: null # Path to a folder that contains a .ckpt file diff --git a/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml b/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml index 8190e47aa5..1a04661f2d 100755 --- a/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml +++ b/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml @@ -45,6 +45,10 @@ exp_manager: save_best_model: True model: # For different fine_tuning tasks, tuning the hyper parameters accordingly + # use MT5 model from megatron.core + mcore_t5: True + transformer_engine: False + restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_mt5.nemo # Path to a trained mt5 .nemo file pretrained_checkpoint: checkpoint_dir: null # Path to a folder that contains a .ckpt file diff --git a/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml b/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml index 54c3166405..8fa355f121 100755 --- a/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml +++ b/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml @@ -45,6 +45,10 @@ exp_manager: save_best_model: True model: # For different fine_tuning tasks, tuning the hyper parameters accordingly; below is only for MNLI + # use T5 model from megatron.core + mcore_t5: True + transformer_engine: False + restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_t5.nemo # Path to a trained T5 .nemo file pretrained_checkpoint: checkpoint_dir: null # Path to a folder that contains a .ckpt file diff --git a/launcher_scripts/conf/fine_tuning/t5/squad.yaml b/launcher_scripts/conf/fine_tuning/t5/squad.yaml index d608fd28ec..66f9371a4c 100755 --- a/launcher_scripts/conf/fine_tuning/t5/squad.yaml +++ b/launcher_scripts/conf/fine_tuning/t5/squad.yaml @@ -45,6 +45,10 @@ exp_manager: save_best_model: True model: # For different fine_tuning tasks, tuning the hyper parameters accordingly; below is only for MNLI + # use T5 model from megatron.core + mcore_t5: True + transformer_engine: False + restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_t5.nemo # Path to a trained T5 .nemo file pretrained_checkpoint: checkpoint_dir: null # Path to a folder that contains a .ckpt file diff --git a/launcher_scripts/conf/peft/t5/squad.yaml b/launcher_scripts/conf/peft/t5/squad.yaml index 9c8670dea7..add172fcec 100644 --- a/launcher_scripts/conf/peft/t5/squad.yaml +++ b/launcher_scripts/conf/peft/t5/squad.yaml @@ -54,6 +54,10 @@ exp_manager: strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. model: + # use T5 model from megatron.core + mcore_t5: True + transformer_engine: False + seed: 1234 tensor_model_parallel_size: 1 # intra-layer model parallelism pipeline_model_parallel_size: 1 # inter-layer model parallelism diff --git a/launcher_scripts/conf/training/mt5/11b.yaml b/launcher_scripts/conf/training/mt5/11b.yaml index 3111159db4..50af98b666 100755 --- a/launcher_scripts/conf/training/mt5/11b.yaml +++ b/launcher_scripts/conf/training/mt5/11b.yaml @@ -53,6 +53,10 @@ exp_manager: buffer_size: 5 model: + # use MT5 model from megatron.core + mcore_t5: True + transformer_engine: False + # model parallelism micro_batch_size: 24 global_batch_size: 1920 # will use more micro batches to reach global batch size diff --git a/launcher_scripts/conf/training/mt5/170m.yaml b/launcher_scripts/conf/training/mt5/170m.yaml index b166c26496..ad1e28df91 100755 --- a/launcher_scripts/conf/training/mt5/170m.yaml +++ b/launcher_scripts/conf/training/mt5/170m.yaml @@ -53,6 +53,10 @@ exp_manager: buffer_size: 5 model: + # use MT5 model from megatron.core + mcore_t5: True + transformer_engine: False + # model parallelism micro_batch_size: 64 global_batch_size: 2048 # will use more micro batches to reach global batch size diff --git a/launcher_scripts/conf/training/mt5/23b.yaml b/launcher_scripts/conf/training/mt5/23b.yaml index dab9d9504e..1059115e15 100755 --- a/launcher_scripts/conf/training/mt5/23b.yaml +++ b/launcher_scripts/conf/training/mt5/23b.yaml @@ -53,6 +53,10 @@ exp_manager: buffer_size: 5 model: + # use MT5 model from megatron.core + mcore_t5: True + transformer_engine: False + # model parallelism micro_batch_size: 8 global_batch_size: 1920 # will use more micro batches to reach global batch size diff --git a/launcher_scripts/conf/training/mt5/390m.yaml b/launcher_scripts/conf/training/mt5/390m.yaml index c03436bb8b..fde322e812 100755 --- a/launcher_scripts/conf/training/mt5/390m.yaml +++ b/launcher_scripts/conf/training/mt5/390m.yaml @@ -53,6 +53,10 @@ exp_manager: buffer_size: 5 model: + # use MT5 model from megatron.core + mcore_t5: True + transformer_engine: False + # model parallelism micro_batch_size: 32 global_batch_size: 2048 # will use more micro batches to reach global batch size diff --git a/launcher_scripts/conf/training/mt5/3b.yaml b/launcher_scripts/conf/training/mt5/3b.yaml index 96b2c367bb..e30d078c66 100755 --- a/launcher_scripts/conf/training/mt5/3b.yaml +++ b/launcher_scripts/conf/training/mt5/3b.yaml @@ -53,6 +53,10 @@ exp_manager: buffer_size: 5 model: + # use MT5 model from megatron.core + mcore_t5: True + transformer_engine: False + # model parallelism micro_batch_size: 24 global_batch_size: 1920 # will use more micro batches to reach global batch size diff --git a/launcher_scripts/conf/training/t5/11b.yaml b/launcher_scripts/conf/training/t5/11b.yaml index 0f47b6e5e7..a2dccda4e6 100755 --- a/launcher_scripts/conf/training/t5/11b.yaml +++ b/launcher_scripts/conf/training/t5/11b.yaml @@ -51,6 +51,10 @@ exp_manager: buffer_size: 5 model: + # use T5 model from megatron.core + mcore_t5: True + transformer_engine: False + # model parallelism micro_batch_size: 24 global_batch_size: 1920 # will use more micro batches to reach global batch size diff --git a/launcher_scripts/conf/training/t5/220m.yaml b/launcher_scripts/conf/training/t5/220m.yaml index 73f56344a5..3bc4b85b15 100755 --- a/launcher_scripts/conf/training/t5/220m.yaml +++ b/launcher_scripts/conf/training/t5/220m.yaml @@ -51,6 +51,10 @@ exp_manager: buffer_size: 5 model: + # use T5 model from megatron.core + mcore_t5: True + transformer_engine: False + # model parallelism micro_batch_size: 64 global_batch_size: 2048 # will use more micro batches to reach global batch size diff --git a/launcher_scripts/conf/training/t5/23b.yaml b/launcher_scripts/conf/training/t5/23b.yaml index 1050285cc7..aded437f91 100755 --- a/launcher_scripts/conf/training/t5/23b.yaml +++ b/launcher_scripts/conf/training/t5/23b.yaml @@ -51,6 +51,10 @@ exp_manager: buffer_size: 5 model: + # use T5 model from megatron.core + mcore_t5: True + transformer_engine: False + # model parallelism micro_batch_size: 8 global_batch_size: 1920 # will use more micro batches to reach global batch size diff --git a/launcher_scripts/conf/training/t5/3b.yaml b/launcher_scripts/conf/training/t5/3b.yaml index 02c51654fc..d105754787 100755 --- a/launcher_scripts/conf/training/t5/3b.yaml +++ b/launcher_scripts/conf/training/t5/3b.yaml @@ -51,6 +51,10 @@ exp_manager: buffer_size: 5 model: + # use T5 model from megatron.core + mcore_t5: True + transformer_engine: False + # model parallelism micro_batch_size: 24 global_batch_size: 1920 # will use more micro batches to reach global batch size diff --git a/launcher_scripts/conf/training/t5/41b.yaml b/launcher_scripts/conf/training/t5/41b.yaml index 599e389f16..64750455c8 100755 --- a/launcher_scripts/conf/training/t5/41b.yaml +++ b/launcher_scripts/conf/training/t5/41b.yaml @@ -51,6 +51,10 @@ exp_manager: buffer_size: 5 model: + # use T5 model from megatron.core + mcore_t5: True + transformer_engine: False + # model parallelism micro_batch_size: 6 global_batch_size: 1920 # will use more micro batches to reach global batch size