Launcher Mcore T5 model for training/sft/eval/peft #383

Merged: 5 commits, Jul 31, 2024

204 changes: 204 additions & 0 deletions launcher_scripts/conf/evaluation/peft_t5/squad.yaml
@@ -0,0 +1,204 @@
run:
name: eval_${.task_name}_${.model_train_name}
time_limit: "04:00:00"
dependency: "singleton"
convert_name: convert_nemo
model_train_name: t5
task_name: "squad" # SQuAD v1.1
convert_dir: ${base_results_dir}/${.model_train_name}/${.convert_name}
fine_tuning_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name}
results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name}_eval

trainer:
devices: 8
num_nodes: 1
accelerator: gpu
precision: bf16
logger: False # logger provided by exp_manager
enable_checkpointing: False
use_distributed_sampler: False
log_every_n_steps: 10
max_steps: 1000

exp_manager:
explicit_log_dir: ${evaluation.run.results_dir}/results
exp_dir: null
name: megatron_t5_${evaluation.run.task_name}_eval
create_checkpoint_callback: False

model:
seed: 1234
tensor_model_parallel_size: 1
pipeline_model_parallel_size: 1

global_batch_size: 32
micro_batch_size: 4
restore_from_path: ${evaluation.run.convert_dir}/results/megatron_t5.nemo # Path to converted t5 .nemo file
resume_from_checkpoint: null
save_nemo_on_validation_end: True
sync_batch_comm: False
megatron_amp_O2: False

sequence_parallel: False

activations_checkpoint_granularity: null
activations_checkpoint_method: null
activations_checkpoint_num_layers: null
activations_checkpoint_layers_per_pipeline: null
answer_only_loss: False
gradient_as_bucket_view: False

hidden_dropout: 0.0
attention_dropout: 0.0
ffn_dropout: 0.0

peft:
peft_scheme: "ptuning" # one of: adapter, ia3, lora, or ptuning
restore_from_path: ${evaluation.run.fine_tuning_dir}/${.peft_scheme}/megatron_t5_peft_tuning-${.peft_scheme}/checkpoints/megatron_t5_peft_tuning-${.peft_scheme}.nemo
restore_from_ckpt_name: null
restore_from_hparams_path: null

# Used for adapter peft training
adapter_tuning:
type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter'
adapter_dim: 32
adapter_dropout: 0.0
norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used.
column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
norm_type: 'mixedfusedlayernorm' # IGNORED if linear_adapter is used, options are ['layernorm', 'mixedfusedlayernorm']
layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers
weight_tying: False
position_embedding_strategy: null # used only when weight_tying is True

lora_tuning:
adapter_dim: 32
adapter_dropout: 0.0
column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal
row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal
layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers
weight_tying: False
position_embedding_strategy: null # used only when weight_tying is True

# Used for p-tuning peft training
p_tuning:
virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence
bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck
embedding_dim: 1024 # the size of the prompt encoder embeddings
init_std: 0.023

ia3_tuning:
layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add ia3 adapters to layer 1 (lowest) and 12. null will apply adapters to all layers

data:
test_ds:
src_file_name: ${data_dir}/squad_data/v1.1/train-v1.1_src.txt
tgt_file_name: ${data_dir}/squad_data/v1.1/train-v1.1_tgt.txt
test_names:
- squad
global_batch_size: 512
micro_batch_size: 64
shuffle: False
num_workers: 4
pin_memory: True
max_seq_length: 2048
min_seq_length: 1
drop_last: False
context_key: 'input'
label_key: 'output'
add_eos: True
add_sep: False
add_bos: False
max_src_seq_length: 512
max_tgt_seq_length: 128
write_predictions_to_file: True
output_file_path_prefix: ${evaluation.run.results_dir}/results/predictions
truncation_field: "context"
index_mapping_dir: null
prompt_template: "{input} {output}"
tokens_to_generate: 20
truncation_method: 'right'

metric:
name: "exact_string_match" # Name of the evaluation metric to use.
average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
num_classes: null

inference:
greedy: True # if True, use greedy decoding; otherwise use sampling
top_k: 0 # The number of highest probability vocabulary tokens to keep for top-k-filtering.
top_p: 0.9 # If set to float < 1, only the most probable tokens with probabilities that add up to top_p or higher are kept for generation.
temperature: 1.0 # sampling temperature
all_probs: False # whether to return the log prob for all the tokens in the vocab
repetition_penalty: 1.2 # The parameter for repetition penalty. 1.0 means no penalty.
min_tokens_to_generate: 0 # The minimum length of the sequence to be generated.
compute_logprob: False # whether to compute the log prob of all the input text; a special case of running inference, default False
outfile_path: output.txt
compute_attention_mask: True

# server-related configs
server: False # whether to launch the API server
port: 5555 # the port number for the inference server
web_server: False # whether to launch the web inference server
share: True # whether to create a public URL
username: test # username for the web client
password: test2 # password for the web client
web_port: 9889 # the port number of the web server
chat: False # use the chat interface
chatbot_config:
value: False # whether to inject the value attributes
attributes:
- name: Quality
min: 0
max: 4
key: quality
type: int
default: 4
- name: Toxicity
min: 0
max: 4
key: toxicity
type: int
default: 0
- name: Humor
min: 0
max: 4
key: humor
type: int
default: 0
- name: Creativity
min: 0
max: 4
key: creativity
type: int
default: 0
- name: Violence
min: 0
max: 4
key: violence
type: int
default: 0
- name: Helpfulness
min: 0
max: 4
key: helpfulness
type: int
default: 4
- name: Not_Appropriate
min: 0
max: 4
key: not_appropriate
type: int
default: 0
- name: Language
choices: ['ar', 'bg', 'bn', 'ca', 'cs', 'da', 'de', 'el', 'en', 'eo', 'es', 'eu', 'fa', 'fi', 'fr', 'gl', 'he', 'hu', 'id', 'it', 'ja', 'ko', 'nb', 'nl', 'pl', 'pt', 'ro', 'ru', 'sk', 'sv', 'th', 'tr', 'uk', 'vi', 'zh']
key: lang
type: list
default: en

user: User
assistant: Assistant
system: "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n"



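The run block above relies on OmegaConf-style interpolation: ${.key} is a relative reference to a sibling key, while ${base_results_dir} is resolved from the root config. A minimal sketch of how those references resolve, assuming the launcher loads the file with OmegaConf (as Hydra-based launchers typically do); the "/results" base directory and the top-level layout are placeholders for illustration:

from omegaconf import OmegaConf

# Minimal sketch; "/results" and the flat layout are assumptions, not launcher defaults.
cfg = OmegaConf.create({
    "base_results_dir": "/results",
    "run": {
        "name": "eval_${.task_name}_${.model_train_name}",
        "convert_name": "convert_nemo",
        "model_train_name": "t5",
        "task_name": "squad",
        "convert_dir": "${base_results_dir}/${.model_train_name}/${.convert_name}",
    },
})
print(cfg.run.name)         # eval_squad_t5
print(cfg.run.convert_dir)  # /results/t5/convert_nemo
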
21 changes: 14 additions & 7 deletions launcher_scripts/conf/peft/t5/squad.yaml
@@ -14,7 +14,7 @@ trainer:
devices: 8
accelerator: gpu
num_nodes: 1
precision: bf16
precision: 16
Review comment (Collaborator):
Is the precision meant to be 16 here?

Reply (Author):
@thomasdhc Yes, it is meant to be 16. In our PEFT experiments we haven't yet tested thoroughly with bf16, only with 16.

logger: False # logger provided by exp_manager
enable_checkpointing: False
use_distributed_sampler: False
@@ -58,7 +58,7 @@ model:
tensor_model_parallel_size: 1 # intra-layer model parallelism
pipeline_model_parallel_size: 1 # inter-layer model parallelism

global_batch_size: 128
global_batch_size: 32
micro_batch_size: 4
restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with
resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc.
@@ -131,8 +131,10 @@ model:
# - /path/to/boolq.jsonl
# Example of how each dataset is formatted
# {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'}
file_names:
- ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data.
src_file_name: ${data_dir}/squad_data/v1.1/train-v1.1_src.txt
tgt_file_name: ${data_dir}/squad_data/v1.1/train-v1.1_tgt.txt
max_src_seq_length: 512
max_tgt_seq_length: 128
global_batch_size: ${peft.model.global_batch_size}
micro_batch_size: ${peft.model.micro_batch_size}
shuffle: True
@@ -160,8 +162,10 @@ model:
prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}"

validation_ds:
file_names:
- ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
src_file_name: ${data_dir}/squad_data/v1.1/dev-v1.1_src.txt
tgt_file_name: ${data_dir}/squad_data/v1.1/dev-v1.1_tgt.txt
max_src_seq_length: 512
max_tgt_seq_length: 128
names:
- ${peft.run.task_name} # Names of the corresponding datasets used to log metrics.
global_batch_size: ${peft.model.global_batch_size}
@@ -190,7 +194,10 @@ model:
average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported.
num_classes: null
test_ds:
file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds.
src_file_name: ${data_dir}/squad_data/v1.1/dev-v1.1_src.txt
tgt_file_name: ${data_dir}/squad_data/v1.1/dev-v1.1_tgt.txt
max_src_seq_length: 512
max_tgt_seq_length: 128
names: null # Names of the corresponding datasets used to log metrics.
global_batch_size: ${peft.model.global_batch_size}
micro_batch_size: ${peft.model.micro_batch_size}
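
The hunks above move the T5 PEFT data config from GPT-style JSONL file_names to paired src/tgt text files. A hypothetical helper sketching the assumed relationship between the two formats; the function name, paths, and one-example-per-line layout are assumptions, only the 'input'/'output' field names come from the example record in the config comments:

import json

def jsonl_to_src_tgt(jsonl_path: str, src_path: str, tgt_path: str) -> None:
    with open(jsonl_path) as fin, open(src_path, "w") as fsrc, open(tgt_path, "w") as ftgt:
        for line in fin:
            record = json.loads(line)
            # One source/target pair per line; embedded newlines are flattened
            # so the two files stay line-aligned.
            fsrc.write(record["input"].replace("\n", " ") + "\n")
            ftgt.write(record["output"].replace("\n", " ") + "\n")

# Example (hypothetical paths):
# jsonl_to_src_tgt("train-v1.1_gpt.json", "train-v1.1_src.txt", "train-v1.1_tgt.txt")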
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/mt5/11b.yaml
@@ -53,9 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
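
Each training config in this PR pairs a per-GPU micro_batch_size with a much larger global_batch_size; as the comment above notes, the gap is closed by accumulating micro batches. A minimal sketch of that arithmetic, assuming pure data parallelism (the rank count below is an arbitrary example, not taken from this PR):

def accumulation_steps(global_batch: int, micro_batch: int, data_parallel_ranks: int) -> int:
    # Number of micro batches each data-parallel rank processes per optimizer step.
    assert global_batch % (micro_batch * data_parallel_ranks) == 0
    return global_batch // (micro_batch * data_parallel_ranks)

# For the mt5 11b values above (micro=24, global=1920) and a hypothetical 20 data-parallel ranks:
print(accumulation_steps(1920, 24, 20))  # 4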
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/mt5/170m.yaml
@@ -53,9 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 64
global_batch_size: 2048 # will use more micro batches to reach global batch size
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/mt5/23b.yaml
@@ -53,9 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 8
global_batch_size: 1920 # will use more micro batches to reach global batch size
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/mt5/390m.yaml
@@ -53,9 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 32
global_batch_size: 2048 # will use more micro batches to reach global batch size
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/mt5/3b.yaml
@@ -53,9 +53,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/t5/11b.yaml
@@ -51,9 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
4 changes: 2 additions & 2 deletions launcher_scripts/conf/training/t5/220m.yaml
@@ -51,8 +51,8 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 64
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/t5/23b.yaml
@@ -51,9 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 8
global_batch_size: 1920 # will use more micro batches to reach global batch size
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/t5/3b.yaml
@@ -51,9 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 24
global_batch_size: 1920 # will use more micro batches to reach global batch size
6 changes: 3 additions & 3 deletions launcher_scripts/conf/training/t5/41b.yaml
@@ -51,9 +51,9 @@ exp_manager:
buffer_size: 5

model:
mcore_t5: False
transformer_engine: False
mcore_t5: True
transformer_engine: True

# model parallelism
micro_batch_size: 6
global_batch_size: 1920 # will use more micro batches to reach global batch size
1 change: 1 addition & 0 deletions launcher_scripts/main.py
@@ -115,6 +115,7 @@
"peft_mistral",
"peft_mixtral",
"peft_qwen2",
"peft_t5",
],
DiffusionModelEvaluation: ["stable_diffusion", "imagen"],
},