Launcher Mcore T5 model for training/sft/eval/peft #383

Merged: 5 commits, Jul 31, 2024

204 changes: 204 additions & 0 deletions launcher_scripts/conf/evaluation/peft_t5/squad.yaml
@@ -0,0 +1,204 @@
run:
name: eval_${.task_name}_${.model_train_name}
time_limit: "04:00:00"
dependency: "singleton"
convert_name: convert_nemo
model_train_name: t5
task_name: "squad" # SQuAD v1.1
convert_dir: ${base_results_dir}/${.model_train_name}/${.convert_name}
fine_tuning_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name}
results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name}_eval

trainer:
devices: 8
num_nodes: 1
accelerator: gpu
precision: bf16
logger: False # logger provided by exp_manager
enable_checkpointing: False
use_distributed_sampler: False
log_every_n_steps: 10
max_steps: 1000

exp_manager:
explicit_log_dir: ${evaluation.run.results_dir}/results
exp_dir: null
name: megatron_t5_${evaluation.run.task_name}_eval
create_checkpoint_callback: False

model:
seed: 1234
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1

global_batch_size: 32
micro_batch_size: 4
restore_from_path: ${evaluation.run.convert_dir}/results/megatron_t5.nemo # Path to converted t5 .nemo file
resume_from_checkpoint: null
save_nemo_on_validation_end: True
sync_batch_comm: False
megatron_amp_O2: False

sequence_parallel: False

activations_checkpoint_granularity: null
activations_checkpoint_method: null
activations_checkpoint_num_layers: null
activations_checkpoint_layers_per_pipeline: null
answer_only_loss: False
gradient_as_bucket_view: False

hidden_dropout: 0.0
attention_dropout: 0.0
ffn_dropout: 0.0

peft:
peft_scheme: "ptuning" # one of: adapter, ia3, lora, or ptuning
restore_from_path: ${evaluation.run.fine_tuning_dir}/${.peft_scheme}/megatron_t5_peft_tuning-${.peft_scheme}/checkpoints/megatron_t5_peft_tuning-${.peft_scheme}.nemo
restore_from_ckpt_name: null
restore_from_hparams_path: null

# Used for adapter peft training
adapter_tuning:
type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
adapter_dim: 32
adapter_dropout: 0.0
norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used.
column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
norm_type: 'mixedfusedlayernorm' # IGNORED if linear_adapter is used, options are ['layernorm', 'mixedfusedlayernorm']
layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers
weight_tying: False
position_embedding_strategy: null # used only when weight_tying is True

lora_tuning:
adapter_dim: 32
adapter_dropout: 0.0
column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
weight_tying: False
position_embedding_strategy: null # used only when weight_tying is True

# Used for p-tuning peft training
p_tuning:
virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence
bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck
embedding_dim: 1024 # the size of the prompt encoder embeddings
init_std: 0.023

ia3_tuning:
layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add ia3 adapters to layer 1 (lowest) and 12. null will apply adapters to all layers

data:
test_ds:
src_file_name: ${data_dir}/squad_data/v1.1/train-v1.1_src.txt
tgt_file_name: ${data_dir}/squad_data/v1.1/train-v1.1_tgt.txt
test_names:
- squad
global_batch_size: 512
micro_batch_size: 64
shuffle: False
num_workers: 4
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
drop_last: False
context_key: 'input'
label_key: 'output'
add_eos: True
add_sep: False
add_bos: False
max_src_seq_length: 512
max_tgt_seq_length: 128
write_predictions_to_file: True
output_file_path_prefix: ${evaluation.run.results_dir}/results/predictions
truncation_field: "context"
index_mapping_dir: null
prompt_template: "{input} {output}"
tokens_to_generate: 20
truncation_method: 'right'

metric:
name: "exact_string_match" # Name of the evaluation metric to use.
average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
num_classes: null

inference:
greedy: True # if True, use greedy decoding; otherwise use sampling
top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
temperature: 1.0 # sampling temperature
all_probs: False # whether to return the log prob for all the tokens in the vocab
repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
compute_logprob: False # whether to compute the log prob of all the input text; a special case of running inference, default False
outfile_path: output.txt
compute_attention_mask: True

# server-related configs
server: False # whether to launch the API server
port: 5555 # the port number for the inference server
web_server: False # whether to launch the web inference server
share: True # whether to create a public URL
username: test # username for the web client
password: test2 # password for the web client
web_port: 9889 # the port number of the web server
chat: False # use the chat interface
chatbot_config:
value: False # whether to inject the value attributes
attributes:
- name: Quality
min: 0
max: 4
key: quality
type: int
default: 4
- name: Toxicity
min: 0
max: 4
key: toxicity
type: int
default: 0
- name: Humor
min: 0
max: 4
key: humor
type: int
default: 0
- name: Creativity
min: 0
max: 4
key: creativity
type: int
default: 0
- name: Violence
min: 0
max: 4
key: violence
type: int
default: 0
- name: Helpfulness
min: 0
max: 4
key: helpfulness
type: int
default: 4
- name: Not_Appropriate
min: 0
max: 4
key: not_appropriate
type: int
default: 0
- name: Language
choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh']
key: lang
type: list
default: en

user: User
assistant: Assistant
system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n"



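The run block above relies on OmegaConf-style interpolation: ${.key} is a relative reference to a sibling key, while ${base_results_dir} is resolved from the root config. A minimal sketch of how those references resolve, assuming the launcher loads the file with OmegaConf (as Hydra-based launchers typically do); the "/results" base directory and the top-level layout are placeholders for illustration:

from omegaconf import OmegaConf

# Minimal sketch; "/results" and the flat layout are assumptions, not launcher defaults.
cfg = OmegaConf.create({
    "base_results_dir": "/results",
    "run": {
        "name": "eval_${.task_name}_${.model_train_name}",
        "convert_name": "convert_nemo",
        "model_train_name": "t5",
        "task_name": "squad",
        "convert_dir": "${base_results_dir}/${.model_train_name}/${.convert_name}",
    },
})
print(cfg.run.name)         # eval_squad_t5
print(cfg.run.convert_dir)  # /results/t5/convert_nemo
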
21 changes: 14 additions & 7 deletions launcher_scripts/conf/peft/t5/squad.yaml
@@ -14,7 +14,7 @@ trainer:
devices: 8
accelerator: gpu
num_nodes: 1
precision: bf16
precision: 16
Review comment (Collaborator):
Is the precision meant to be 16 here?

Reply (Author):
@thomasdhc Yes, it is meant to be 16. In our PEFT experiments we haven't yet tested thoroughly with bf16, only with 16.

logger: False # logger provided by exp_manager
enable_checkpointing: False
use_distributed_sampler: False
@@ -58,7 +58,7 @@ model:
tensor_model_parallel_size: 1 # intra-layer model parallelism
pipeline_model_parallel_size: 1 # inter-layer model parallelism

global_batch_size: 128
global_batch_size: 32
micro_batch_size: 4
restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
@@ -131,8 +131,10 @@ model:
# - /path/to/boolq.jsonl
# Example of how each dataset is formatted
# {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
file_names:
- ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data.
src_file_name: ${data_dir}/squad_data/v1.1/train-v1.1_src.txt
tgt_file_name: ${data_dir}/squad_data/v1.1/train-v1.1_tgt.txt
max_src_seq_length: 512
max_tgt_seq_length: 128
global_batch_size: ${peft.model.global_batch_size}
micro_batch_size: ${peft.model.micro_batch_size}
shuffle: True
@@ -160,8 +162,10 @@ model:
prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"

validation_ds:
file_names:
- ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
src_file_name: ${data_dir}/squad_data/v1.1/dev-v1.1_src.txt
tgt_file_name: ${data_dir}/squad_data/v1.1/dev-v1.1_tgt.txt
max_src_seq_length: 512
max_tgt_seq_length: 128
names:
- ${peft.run.task_name} # Names of the corresponding datasets used to log metrics.
global_batch_size: ${peft.model.global_batch_size}
@@ -190,7 +194,10 @@ model:
average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
num_classes: null
test_ds:
file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
src_file_name: ${data_dir}/squad_data/v1.1/dev-v1.1_src.txt
tgt_file_name: ${data_dir}/squad_data/v1.1/dev-v1.1_tgt.txt
max_src_seq_length: 512
max_tgt_seq_length: 128
names: null # Names of the corresponding datasets used to log metrics.
global_batch_size: ${peft.model.global_batch_size}
micro_batch_size: ${peft.model.micro_batch_size}
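
The hunks above move the T5 PEFT data config from GPT-style JSONL file_names to paired src/tgt text files. A hypothetical helper sketching the assumed relationship between the two formats; the function name, paths, and one-example-per-line layout are assumptions, only the 'input'/'output' field names come from the example record in the config comments:

import json

def jsonl_to_src_tgt(jsonl_path: str, src_path: str, tgt_path: str) -> None:
    with open(jsonl_path) as fin, open(src_path, "w") as fsrc, open(tgt_path, "w") as ftgt:
        for line in fin:
            record = json.loads(line)
            # One source/target pair per line; embedded newlines are flattened
            # so the two files stay line-aligned.
            fsrc.write(record["input"].replace("\n", " ") + "\n")
            ftgt.write(record["output"].replace("\n", " ") + "\n")

# Example (hypothetical paths):
# jsonl_to_src_tgt("train-v1.1_gpt.json", "train-v1.1_src.txt", "train-v1.1_tgt.txt")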
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/mt5/11b.yaml
@@ -53,9 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
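
Each training config in this PR pairs a per-GPU micro_batch_size with a much larger global_batch_size; as the comment above notes, the gap is closed by accumulating micro batches. A minimal sketch of that arithmetic, assuming pure data parallelism (the rank count below is an arbitrary example, not taken from this PR):

def accumulation_steps(global_batch: int, micro_batch: int, data_parallel_ranks: int) -> int:
    # Number of micro batches each data-parallel rank processes per optimizer step.
    assert global_batch % (micro_batch * data_parallel_ranks) == 0
    return global_batch // (micro_batch * data_parallel_ranks)

# For the mt5 11b values above (micro=24, global=1920) and a hypothetical 20 data-parallel ranks:
print(accumulation_steps(1920, 24, 20))  # 4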
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/mt5/170m.yaml
@@ -53,9 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 64
global_batch_size: 2048 # will use more micro batches to reach global batch size
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/mt5/23b.yaml
@@ -53,9 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 8
global_batch_size: 1920 # will use more micro batches to reach global batch size
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/mt5/390m.yaml
@@ -53,9 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 32
global_batch_size: 2048 # will use more micro batches to reach global batch size
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/mt5/3b.yaml
@@ -53,9 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/t5/11b.yaml
@@ -51,9 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
4 changes: 2 additions & 2 deletions launcher_scripts/conf/training/t5/220m.yaml
@@ -51,8 +51,8 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 64
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/t5/23b.yaml
@@ -51,9 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 8
global_batch_size: 1920 # will use more micro batches to reach global batch size
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/t5/3b.yaml
@@ -51,9 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/t5/41b.yaml
@@ -51,9 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 6
global_batch_size: 1920 # will use more micro batches to reach global batch size
1 change: 1 addition & 0 deletions launcher_scripts/main.py
@@ -115,6 +115,7 @@
"peft_mistral",
"peft_mixtral",
"peft_qwen2",
"peft_t5",
],
DiffusionModelEvaluation: ["stable_diffusion", "imagen"],
},