From 2e3563ebe6dbe0cb9c81cb9cd94cbc25af41996d Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 12 Jul 2023 05:29:12 -0700 Subject: [PATCH 01/62] add data preparation for llama, fix pile downloading url --- .gitignore | 1 + launcher_scripts/conf/cluster/bcm.yaml | 12 +++++----- launcher_scripts/conf/config.yaml | 13 ++++++----- .../gpt3/download_gpt3_pile.yaml | 2 +- .../llama/download_llama_pile.yaml | 23 +++++++++++++++++++ launcher_scripts/main.py | 2 +- .../pile_dataprep/conf/config.yaml | 2 ++ .../pile_dataprep/preprocess.py | 10 ++++++++ .../nemo_launcher/core/data_stages.py | 2 ++ 9 files changed, 53 insertions(+), 14 deletions(-) create mode 100755 launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml diff --git a/.gitignore b/.gitignore index b254c682e8..4517784c6b 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ #*.ipynb output result +data *.pt tests/data/asr .DS_Store diff --git a/launcher_scripts/conf/cluster/bcm.yaml b/launcher_scripts/conf/cluster/bcm.yaml index e1f7b32c6b..8ff05b1fe3 100755 --- a/launcher_scripts/conf/cluster/bcm.yaml +++ b/launcher_scripts/conf/cluster/bcm.yaml @@ -1,9 +1,9 @@ -partition: null -account: null -exclusive: True +partition: luna +account: devtech +exclusive: true gpus_per_task: null -gpus_per_node: 8 +gpus_per_node: null mem: 0 -job_name_prefix: "nemo-megatron-" +job_name_prefix: 'devtech-gpt:' srun_args: - - "--no-container-mount-home" + - --no-container-mount-home diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 50ada47bdc..1fed0ca7ed 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -1,7 +1,7 @@ defaults: - _self_ - cluster: bcm # Leave it as bcm even if using bcp. It will be ignored for bcp. - - data_preparation: gpt3/download_gpt3_pile + - data_preparation: llama/download_llama_pile - training: gpt3/5b - conversion: gpt3/convert_gpt3 - fine_tuning: null @@ -20,13 +20,14 @@ hydra: debug: False stages: - - training - - conversion - - evaluation - - export + - data_preparation + #- training + #- conversion + #- evaluation + #- export cluster_type: bcm # bcm or bcp. If bcm, it must match - cluster above. -launcher_scripts_path: ??? # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts +launcher_scripts_path: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/launcher_scripts # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts data_dir: ${launcher_scripts_path}/data # Location to store and read the data. base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. container_mounts: # List of additional paths to mount to container. They will be mounted to same path. diff --git a/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml b/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml index 632ccdadd2..ab6614480a 100755 --- a/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml +++ b/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml @@ -9,7 +9,7 @@ run: dataset: pile download_the_pile: True # Whether to download the pile dataset from the internet. -the_pile_url: "https://mystic.the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. +the_pile_url: "https://the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. 
file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json" # URL to download the vocab from. diff --git a/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml new file mode 100755 index 0000000000..cc23a234a7 --- /dev/null +++ b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml @@ -0,0 +1,23 @@ +run: + name: download_llama_pile + results_dir: ${base_results_dir}/${.name} + time_limit: "1:00:00" + dependency: "singleton" + node_array_size: 30 + array: ${..file_numbers} + bcp_preproc_npernode: 2 # 2 should be safe to use and x2 times faster. + +dataset: pile +download_the_pile: True # Whether to download the pile dataset from the internet. +the_pile_url: "https://the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. +file_numbers: "0-1" # The pile dataset consists of 30 files (0-29), choose which ones to download. +preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. +#download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json" # URL to download the vocab from. +#download_merges_url: "https://huggingface.co/gpt2/resolve/main/merges.txt" # URL to download the merges from. +#vocab_save_dir: ${data_dir}/bpe +#merges_save_dir: ${data_dir}/bpe +#tokenizer_type: GPT2BPETokenizer +tokenizer_library: "sentencepiece" +tokenizer_model: "/lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/checkpoints/llama/7B/tokenizer.model" +rm_downloaded: False # Extract script will remove downloaded zst after extraction +rm_extracted: False # Preprocess script will remove extracted files after preproc. 
diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py index 4053328f2c..25d5a016ca 100755 --- a/launcher_scripts/main.py +++ b/launcher_scripts/main.py @@ -47,7 +47,7 @@ NeMoEvaluation: ["t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", "adapter_gpt3", "ia3_t5", "ia3_gpt3"], }, "data_preparation": { - PileDataPreparation: ["gpt3", "t5", "bert"], + PileDataPreparation: ["gpt3", "t5", "bert", "llama"], MC4DataPreparation: ["mt5"], CustomDataPreparation: ["generic"], }, diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml index 448dbb1dbc..14917628a8 100755 --- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml +++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/conf/config.yaml @@ -9,3 +9,5 @@ rm_extracted: True tokenizer_type: null vocab_save_dir: null merges_save_dir: null +tokenizer_library: null +tokenizer_model: null diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py index 61a9e36560..44ef368c1b 100755 --- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py +++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py @@ -28,6 +28,8 @@ def main(cfg): data_dir = cfg.get("data_dir") rm_extracted = cfg.get("rm_extracted") tokenizer_type = cfg.get("tokenizer_type") + tokenizer_library = cfg.get("tokenizer_library") + tokenizer_model = cfg.get("tokenizer_model") assert data_dir is not None, "data_dir must be a valid path" # Vocab @@ -67,6 +69,8 @@ def main(cfg): model_type = 'bert' elif 'gpt3' in data_config: model_type = 'gpt3' + elif 'llama' in data_config: + model_type = 'llama' output_prefix = os.path.join(data_dir, f"my-{model_type}_{file_number:02d}") @@ -77,6 +81,8 @@ def main(cfg): f"--dataset-impl mmap " f"--tokenizer-library megatron " f"--tokenizer-type {tokenizer_type} " + f"--tokenizer-library {tokenizer_library} " + f"--tokenizer-model {tokenizer_model} " f"--workers $SLURM_CPUS_ON_NODE " ) @@ -119,6 +125,8 @@ def main(cfg): model_type = 'bert' elif 'gpt3' in data_config: model_type = 'gpt3' + elif 'llama' in data_config: + model_type = 'llama' output_prefix = os.path.join(data_dir, f"my-{model_type}_{file_number:02d}") @@ -129,6 +137,8 @@ def main(cfg): f"--dataset-impl mmap " f"--tokenizer-library megatron " f"--tokenizer-type {tokenizer_type} " + f"--tokenizer-library {tokenizer_library} " + f"--tokenizer-model {tokenizer_model} " f"--workers {ncpus} " ) diff --git a/launcher_scripts/nemo_launcher/core/data_stages.py b/launcher_scripts/nemo_launcher/core/data_stages.py index c3713786e5..7158750900 100755 --- a/launcher_scripts/nemo_launcher/core/data_stages.py +++ b/launcher_scripts/nemo_launcher/core/data_stages.py @@ -252,6 +252,8 @@ def _make_sub_stage_command(self, sub_stage: str) -> List[str]: rm_downloaded=self.stage_cfg.get("rm_downloaded"), rm_extracted=self.stage_cfg.get("rm_extracted"), tokenizer_type=self.stage_cfg.get("tokenizer_type"), + tokenizer_library=self.stage_cfg.get("tokenizer_library", "megatron"), + tokenizer_model=self.stage_cfg.get("tokenizer_model", None), vocab_save_dir=self.stage_cfg.get("vocab_save_dir"), merges_save_dir=self.stage_cfg.get("merges_save_dir"), ) From 69f9066085d20e064e668c455881c37ef58b2c67 Mon Sep 
17 00:00:00 2001 From: Hongbin Liu Date: Thu, 13 Jul 2023 00:32:09 -0700 Subject: [PATCH 02/62] add training for llama, add download_tokenizer_url for data preparation --- launcher_scripts/conf/config.yaml | 8 +- .../llama/download_llama_pile.yaml | 4 +- launcher_scripts/conf/training/llama/13b.yaml | 157 +++++++++++++ launcher_scripts/conf/training/llama/30b.yaml | 156 +++++++++++++ launcher_scripts/conf/training/llama/65b.yaml | 157 +++++++++++++ launcher_scripts/conf/training/llama/7b.yaml | 221 ++++++++++++++++++ .../nemo_launcher/core/data_stages.py | 9 + launcher_scripts/nemo_launcher/core/stages.py | 6 +- 8 files changed, 712 insertions(+), 6 deletions(-) create mode 100644 launcher_scripts/conf/training/llama/13b.yaml create mode 100644 launcher_scripts/conf/training/llama/30b.yaml create mode 100644 launcher_scripts/conf/training/llama/65b.yaml create mode 100755 launcher_scripts/conf/training/llama/7b.yaml diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 1fed0ca7ed..f9242678d9 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -2,7 +2,7 @@ defaults: - _self_ - cluster: bcm # Leave it as bcm even if using bcp. It will be ignored for bcp. - data_preparation: llama/download_llama_pile - - training: gpt3/5b + - training: llama/30b - conversion: gpt3/convert_gpt3 - fine_tuning: null - prompt_learning: null @@ -20,8 +20,8 @@ hydra: debug: False stages: - - data_preparation - #- training + #- data_preparation + - training #- conversion #- evaluation #- export @@ -34,7 +34,7 @@ container_mounts: # List of additional paths to mount to container. They will be - null container: nvcr.io/ea-bignlp/nemofw-training:23.05-py3 -wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line. +wandb_api_key_file: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/NeMo-Megatron-Launcher/wandb_api_key # File where the w&B api key is stored. Key must be on the first line. env_vars: NCCL_TOPO_FILE: null # Should be a path to an XML file describing the topology diff --git a/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml index cc23a234a7..863f817661 100755 --- a/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml +++ b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml @@ -12,12 +12,14 @@ download_the_pile: True # Whether to download the pile dataset from the interne the_pile_url: "https://the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. file_numbers: "0-1" # The pile dataset consists of 30 files (0-29), choose which ones to download. preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. +download_tokenizer_url: "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model" #download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json" # URL to download the vocab from. #download_merges_url: "https://huggingface.co/gpt2/resolve/main/merges.txt" # URL to download the merges from. 
#vocab_save_dir: ${data_dir}/bpe #merges_save_dir: ${data_dir}/bpe #tokenizer_type: GPT2BPETokenizer tokenizer_library: "sentencepiece" -tokenizer_model: "/lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/checkpoints/llama/7B/tokenizer.model" +tokenizer_save_dir: ${data_dir}/llama +tokenizer_model: ${.tokenizer_save_dir}/llama_tokenizer.model rm_downloaded: False # Extract script will remove downloaded zst after extraction rm_extracted: False # Preprocess script will remove extracted files after preproc. diff --git a/launcher_scripts/conf/training/llama/13b.yaml b/launcher_scripts/conf/training/llama/13b.yaml new file mode 100644 index 0000000000..cf6f8ec8cc --- /dev/null +++ b/launcher_scripts/conf/training/llama/13b.yaml @@ -0,0 +1,157 @@ +run: + name: llama_13b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-02:00:00 + dependency: singleton +trainer: + num_nodes: 4 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + replace_sampler_ddp: false + max_epochs: null + max_steps: 300000 + max_time: '5:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 50 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama + name: ${training.run.name} + resume_if_exists: true + resume_ignore_no_checkpoint: true + create_checkpoint_callback: false + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, + ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + micro_batch_size: 2 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 2048 + max_position_embeddings: 2048 + num_layers: 40 + hidden_size: 5120 + ffn_hidden_size: 13824 + num_attention_heads: 40 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: 
null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: false + transformer_engine: false + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: false + optim: + name: distributed_fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: false + sched: + name: CosineAnnealing + warmup_steps: 107 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 2048 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - 0.5 + - ${data_dir}/my-llama_00_text_document + - 0.5 + - ${data_dir}/my-llama_01_text_document diff --git a/launcher_scripts/conf/training/llama/30b.yaml b/launcher_scripts/conf/training/llama/30b.yaml new file mode 100644 index 0000000000..33ed5054c8 --- /dev/null +++ b/launcher_scripts/conf/training/llama/30b.yaml @@ -0,0 +1,156 @@ +run: + name: llama_30b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 16 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + replace_sampler_ddp: false + max_epochs: null + max_steps: 300000 + max_time: '19:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 50 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama + name: ${training.run.name} + resume_if_exists: true + resume_ignore_no_checkpoint: true + create_checkpoint_callback: true + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 2048 + max_position_embeddings: 2048 + num_layers: 60 + hidden_size: 6656 + ffn_hidden_size: 17920 + num_attention_heads: 52 + init_method_std: 0.008944 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + 
normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 2 + activations_checkpoint_layers_per_pipeline: 32 + sequence_parallel: false + transformer_engine: false + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: false + optim: + name: fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + #bucket_cap_mb: 125 + #overlap_grad_sync: false + sched: + name: CosineAnnealing + warmup_steps: 107 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 2048 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .5 + - ${data_dir}/my-llama_00_text_document + - .5 + - ${data_dir}/my-llama_01_text_document diff --git a/launcher_scripts/conf/training/llama/65b.yaml b/launcher_scripts/conf/training/llama/65b.yaml new file mode 100644 index 0000000000..464af39c09 --- /dev/null +++ b/launcher_scripts/conf/training/llama/65b.yaml @@ -0,0 +1,157 @@ +run: + name: llama_65b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 16 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + replace_sampler_ddp: false + max_epochs: null + max_steps: 300000 + max_time: '19:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 50 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama + name: ${training.run.name} + resume_if_exists: true + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: false + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + micro_batch_size: 1 + global_batch_size: 2048 + rampup_batch_size: null + 
tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 8 + virtual_pipeline_model_parallel_size: 10 + encoder_seq_length: 2048 + max_position_embeddings: 2048 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 22016 + num_attention_heads: 64 + init_method_std: 0.008944 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 80 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: false + transformer_engine: false + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: false + optim: + name: fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + #bucket_cap_mb: 125 + #overlap_grad_sync: false + sched: + name: CosineAnnealing + warmup_steps: 107 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 2048 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .5 + - ${data_dir}/my-llama_00_text_document + - .5 + - ${data_dir}/my-llama_01_text_document + diff --git a/launcher_scripts/conf/training/llama/7b.yaml b/launcher_scripts/conf/training/llama/7b.yaml new file mode 100755 index 0000000000..fcc4aa58da --- /dev/null +++ b/launcher_scripts/conf/training/llama/7b.yaml @@ -0,0 +1,221 @@ +run: + name: llama_7b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-02:00:00" + dependency: "singleton" +trainer: + num_nodes: 4 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 50 + limit_test_batches: 50 + 
accumulate_grad_batches: 1 + gradient_clip_val: 1.0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: True + wandb_logger_kwargs: + project: nemo_llama + name: ${training.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + micro_batch_size: 2 + global_batch_size: 2048 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 2048 + max_position_embeddings: 2048 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 11008 + num_attention_heads: 32 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'sentencepiece' + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: false # does not support sequence parallel + + ## Transformer Engine + # fp8 training is currently not supported in the improved models + transformer_engine: False + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + use_emha: False + ub_tp_comm_overlap: False + use_flash_attention: false + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: False + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: "99990,8,2" + seq_length: 2048 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .5 + - ${data_dir}/my-llama_00_text_document + - .5 + - ${data_dir}/my-llama_01_text_document + # - .0333 + # - ${data_dir}/my-gpt3_00_text_document + # - .0333 + # - ${data_dir}/my-gpt3_01_text_document + # - .0333 + # - ${data_dir}/my-gpt3_02_text_document + # - .0333 + # - ${data_dir}/my-gpt3_03_text_document + # - .0333 + # - ${data_dir}/my-gpt3_04_text_document + # - .0333 + # - ${data_dir}/my-gpt3_05_text_document + # - .0333 + # - ${data_dir}/my-gpt3_06_text_document + # - .0333 + # - ${data_dir}/my-gpt3_07_text_document + # - .0333 + # - ${data_dir}/my-gpt3_08_text_document + # - .0333 + # - ${data_dir}/my-gpt3_09_text_document + # - .0333 + # - ${data_dir}/my-gpt3_10_text_document + # - .0333 + # - ${data_dir}/my-gpt3_11_text_document + # - .0333 + # - ${data_dir}/my-gpt3_12_text_document + # - .0333 + # - ${data_dir}/my-gpt3_13_text_document + # - .0333 + # - ${data_dir}/my-gpt3_14_text_document + # - .0333 + # - ${data_dir}/my-gpt3_15_text_document + # - .0333 + # - ${data_dir}/my-gpt3_16_text_document + # - .0333 + # - ${data_dir}/my-gpt3_17_text_document + # - .0333 + # - ${data_dir}/my-gpt3_18_text_document + # - .0333 + # - ${data_dir}/my-gpt3_19_text_document + # - .0333 + # - ${data_dir}/my-gpt3_20_text_document + # - .0333 + # - ${data_dir}/my-gpt3_21_text_document + # - .0333 + # - ${data_dir}/my-gpt3_22_text_document + # - .0333 + # - ${data_dir}/my-gpt3_23_text_document + # - .0333 + # - ${data_dir}/my-gpt3_24_text_document + # - .0333 + # - ${data_dir}/my-gpt3_25_text_document + # - .0333 + # - ${data_dir}/my-gpt3_26_text_document + # - .0333 + # - ${data_dir}/my-gpt3_27_text_document + # - .0333 + # - ${data_dir}/my-gpt3_28_text_document + # - .0334 + # - ${data_dir}/my-gpt3_29_text_document + diff --git a/launcher_scripts/nemo_launcher/core/data_stages.py b/launcher_scripts/nemo_launcher/core/data_stages.py index 7158750900..989a06b263 100755 --- a/launcher_scripts/nemo_launcher/core/data_stages.py +++ b/launcher_scripts/nemo_launcher/core/data_stages.py @@ -176,6 +176,15 @@ def setup_folder_and_data(self) -> None: download_merges_url = data_cfg.get("download_merges_url") vocab_save_dir = data_cfg.get("vocab_save_dir") merges_save_dir = data_cfg.get("merges_save_dir") + download_tokenizer_url = data_cfg.get("download_tokenizer_url") + tokenizer_save_dir = data_cfg.get("tokenizer_save_dir") + + if download_tokenizer_url is not None: + assert tokenizer_save_dir is not None, "tokenizer_save_dir must be a valid path." + download_single_file( + url=download_tokenizer_url, save_dir=tokenizer_save_dir, file_name="llama_tokenizer.model", + ) + # Download vocab if download_vocab_url is not None: assert vocab_save_dir is not None, "vocab_save_dir must be a valid path." 
diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 4acd81daa7..b0319ae7ff 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -620,6 +620,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_pretraining.py", "mt5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_pretraining.py", "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/megatron_gpt_pretraining.py", + "llama": self._nemo_code_path / "examples/nlp/language_modeling/megatron_gpt_pretraining.py", "bert": self._nemo_code_path / "examples/nlp/language_modeling/megatron_bert_pretraining.py", } return model_type_to_code_path[model_type] @@ -683,7 +684,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: :return: path current stage's essential nemo scripts code :rtype: Path """ - if model_type == "gpt3": + if model_type == "gpt3" or model_type == "llama": raise NotImplementedError("Fine-tuning is not supported in NeMo Megatron GPT-3 models.") model_type_to_code_path = { "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py", @@ -725,6 +726,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ model_type_to_code_path = { "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/megatron_gpt_prompt_learning.py", + "llama": self._nemo_code_path / "examples/nlp/language_modeling/megatron_gpt_prompt_learning.py", "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_prompt_learning.py", "mt5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_prompt_learning.py", } @@ -748,6 +750,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ model_type_to_code_path = { "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py", + "llama": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_adapter_tuning.py", "t5": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_t5_adapter_tuning.py", } return model_type_to_code_path[model_type] @@ -770,6 +773,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ model_type_to_code_path = { "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py", + "llama": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_ia3_tuning.py", "t5": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_t5_ia3_tuning.py", } return model_type_to_code_path[model_type] From 9a757fbaff326dd91efc0014b9c417cfb183eff6 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 13 Jul 2023 02:14:33 -0700 Subject: [PATCH 03/62] add conversion for llama --- launcher_scripts/conf/config.yaml | 6 +++--- .../conf/conversion/llama/convert_llama.yaml | 21 +++++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) create mode 100755 launcher_scripts/conf/conversion/llama/convert_llama.yaml diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index f9242678d9..6ca1791c0b 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -3,7 +3,7 @@ defaults: - cluster: bcm # Leave it as bcm even if using bcp. It will be ignored for bcp. 
- data_preparation: llama/download_llama_pile - training: llama/30b - - conversion: gpt3/convert_gpt3 + - conversion: llama/convert_llama - fine_tuning: null - prompt_learning: null - adapter_learning: null @@ -21,8 +21,8 @@ debug: False stages: #- data_preparation - - training - #- conversion + #- training + - conversion #- evaluation #- export diff --git a/launcher_scripts/conf/conversion/llama/convert_llama.yaml b/launcher_scripts/conf/conversion/llama/convert_llama.yaml new file mode 100755 index 0000000000..ba8018ee85 --- /dev/null +++ b/launcher_scripts/conf/conversion/llama/convert_llama.yaml @@ -0,0 +1,21 @@ +run: + name: convert_${conversion.run.model_train_name} + nodes: ${divide_ceil:${conversion.model.model_parallel_size}, 8} # 8 gpus per node + time_limit: "1:00:00" + dependency: "singleton" + ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} + convert_name: convert_nemo + model_train_name: llama_7b + train_dir: ${base_results_dir}/${.model_train_name} + results_dir: ${.train_dir}/${.convert_name} + nemo_file_name: megatron_llama.nemo # name of nemo checkpoint; must be .nemo file + +model: + model_type: gpt # gpt or t5, use t5 for mt5 as well + checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints + checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_llama-*last.ckpt) + hparams_file: ${conversion.run.train_dir}/results/hparams.yaml + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + tokenizer_model: ${data_dir}/llama/llama_tokenizer.model From 6a004fd62731f1118ec76d461aed59dc04096aa5 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 19 Jul 2023 00:13:35 -0700 Subject: [PATCH 04/62] add convertion/PEFT/Evaluation for llama Signed-off-by: Hongbin Liu --- .../conf/adapter_learning/llama/squad.yaml | 107 +++++++++ launcher_scripts/conf/config.yaml | 23 +- .../conf/conversion/llama/convert_llama.yaml | 6 +- .../conf/evaluation/llama/evaluate_all.yaml | 24 ++ .../conf/evaluation/prompt_llama/squad.yaml | 21 ++ .../conf/ia3_learning/llama/squad.yaml | 98 ++++++++ .../conf/prompt_learning/llama/squad.yaml | 111 +++++++++ launcher_scripts/conf/training/llama/7b.yaml | 2 +- launcher_scripts/main.py | 2 +- .../dolly_dataprep/download.py | 2 +- .../collections/eval_harness/evaluate.py | 8 +- .../eval_harness/lm_eval/models/__init__.py | 4 +- .../eval_harness/lm_eval/models/nemo_llama.py | 218 ++++++++++++++++++ .../lm_eval/models/nemo_llama_prompt.py | 174 ++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 9 +- 15 files changed, 788 insertions(+), 21 deletions(-) create mode 100755 launcher_scripts/conf/adapter_learning/llama/squad.yaml create mode 100755 launcher_scripts/conf/evaluation/llama/evaluate_all.yaml create mode 100755 launcher_scripts/conf/evaluation/prompt_llama/squad.yaml create mode 100755 launcher_scripts/conf/ia3_learning/llama/squad.yaml create mode 100755 launcher_scripts/conf/prompt_learning/llama/squad.yaml create mode 100755 launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py create mode 100755 launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama_prompt.py diff --git a/launcher_scripts/conf/adapter_learning/llama/squad.yaml b/launcher_scripts/conf/adapter_learning/llama/squad.yaml new file mode 100755 index 0000000000..9907d52635 --- /dev/null +++ b/launcher_scripts/conf/adapter_learning/llama/squad.yaml 
@@ -0,0 +1,107 @@ +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: llama_7b + convert_dir: ${base_results_dir}/${adapter_learning.run.model_train_name}/${adapter_learning.run.convert_name} + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/adapter_learning_${.task_name} + +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: 4 + max_steps: -1 + log_every_n_steps: 10 + val_check_interval: 0.1 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + + +exp_manager: + explicit_log_dir: ${adapter_learning.run.results_dir}/results + exp_dir: null + name: megatron_llama_adapter + create_wandb_logger: True + wandb_logger_kwargs: + project: nemo_llama_adapter + name: ${adapter_learning.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 5 + mode: min + save_nemo_on_train_end: False + filename: "megatron_llama_adapter_learn--{val_loss:.3f}-{step}" + model_parallel_size: ${adapter_learning.model.model_parallel_size} + save_best_model: True + +model: + seed: 1234 + nemo_path: ${adapter_learning.run.results_dir}/results/megatron_gpt_adapter.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved + virtual_prompt_style: 'no-prompts' # adapter tuning requires no virtual prompts + encoder_seq_length: 2048 + gradient_as_bucket_view: false + tensor_model_parallel_size: 4 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + global_batch_size: 64 + micro_batch_size: 8 + + restore_path: null # Path to an existing adapter .nemo model you wish to add new tasks to or run inference with + language_model_path: ${adapter_learning.run.convert_dir}/results/megatron_llama.nemo # Path to the GPT language model .nemo file, always required + existing_tasks: [] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given + new_tasks: ["squad"] # List of new tasknames to be prompt-tuned + + task_templates: # Add more/replace tasks as needed, these are just examples + - taskname: "squad" + prompt_template: "context: {context} question: {question} answer: {answer}" + total_virtual_tokens: 0 + virtual_token_splits: [] + truncate_field: null + answer_only_loss: True + answer_field: "answer" + + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 16 + adapter_dropout: 0.1 + norm_position: 'pre' # This can be set to 'pre' or 'post', 'pre' is normally what is used. 
+ column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + + data: + train_ds: + - ${data_dir}/prompt_data/v1.1/squad_train.jsonl # multiple prompt dataset can be given at the same time + validation_ds: + - ${data_dir}/prompt_data/v1.1/squad_val.jsonl + add_eos: True + shuffle: True + num_workers: 4 + pin_memory: True + + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning + monitor: val_loss + reduce_on_plateau: false diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 6ca1791c0b..9ee398f347 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -2,13 +2,14 @@ defaults: - _self_ - cluster: bcm # Leave it as bcm even if using bcp. It will be ignored for bcp. - data_preparation: llama/download_llama_pile - - training: llama/30b + - training: llama/7b - conversion: llama/convert_llama - fine_tuning: null - - prompt_learning: null - - adapter_learning: null - - ia3_learning: null - - evaluation: gpt3/evaluate_all + - prompt_learning: llama/squad + - adapter_learning: llama/squad + - ia3_learning: llama/squad + #- evaluation: llama/evaluate_all + - evaluation: prompt_llama/squad - export: gpt3/export_gpt3 - override hydra/job_logging: stdout @@ -20,15 +21,19 @@ hydra: debug: False stages: - #- data_preparation - #- training + #- data_preparation + #- training - conversion - #- evaluation - #- export + #- prompt_learning + #- adapter_learning + #- ia3_learning + #- evaluation + #- export cluster_type: bcm # bcm or bcp. If bcm, it must match - cluster above. launcher_scripts_path: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/launcher_scripts # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts data_dir: ${launcher_scripts_path}/data # Location to store and read the data. +nemo_dir: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/nemo_repo/internal/NeMo base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. container_mounts: # List of additional paths to mount to container. They will be mounted to same path. - null diff --git a/launcher_scripts/conf/conversion/llama/convert_llama.yaml b/launcher_scripts/conf/conversion/llama/convert_llama.yaml index ba8018ee85..451d916b20 100755 --- a/launcher_scripts/conf/conversion/llama/convert_llama.yaml +++ b/launcher_scripts/conf/conversion/llama/convert_llama.yaml @@ -8,13 +8,13 @@ run: model_train_name: llama_7b train_dir: ${base_results_dir}/${.model_train_name} results_dir: ${.train_dir}/${.convert_name} - nemo_file_name: megatron_llama.nemo # name of nemo checkpoint; must be .nemo file + nemo_file_name: megatron_llama_prompt.nemo # name of nemo checkpoint; must be .nemo file model: model_type: gpt # gpt or t5, use t5 for mt5 as well - checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints + checkpoint_folder: ${conversion.run.train_dir}/prompt_learning_squad/results/checkpoints checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_llama-*last.ckpt) - hparams_file: ${conversion.run.train_dir}/results/hparams.yaml + hparams_file: ${conversion.run.train_dir}/prompt_learning_squad/results/hparams.yaml tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} diff --git a/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml new file mode 100755 index 0000000000..ca4d9b7456 --- /dev/null +++ b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml @@ -0,0 +1,24 @@ +run: + name: ${.eval_name}_${.model_train_name} + time_limit: "01:00:00" + dependency: "singleton" + nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node + ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} + eval_name: eval_all + model_train_name: llama_7b + train_dir: ${base_results_dir}/${.model_train_name} + tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks + results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} + +model: + model_type: nemo-llama + nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints + checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints + checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) + hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + precision: bf16 # must match training precision - 32, 16 or bf16 + eval_batch_size: 4 + tokenizer_model: ${data_dir}/llama/llama_tokenizer.model diff --git a/launcher_scripts/conf/evaluation/prompt_llama/squad.yaml b/launcher_scripts/conf/evaluation/prompt_llama/squad.yaml new file mode 100755 index 0000000000..7890e97eab --- /dev/null +++ b/launcher_scripts/conf/evaluation/prompt_llama/squad.yaml @@ -0,0 +1,21 @@ +run: + name: ${.eval_name}_${.model_train_name} + time_limit: "1:00:00" + nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node + ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} + eval_name: eval_prompt_squad + model_train_name: llama_7b + tasks: "prompt" # general prompt task + prompt_learning_dir: ${base_results_dir}/${.model_train_name}/prompt_learning_squad # assume prompt learning was on squad task + results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} + +model: + model_type: nemo-llama-prompt + nemo_model: ${evaluation.run.prompt_learning_dir}/results/megatron_llama_prompt.nemo + tensor_model_parallel_size: 2 #1 for 126m, 2 for 5b, 8 for 20b + pipeline_model_parallel_size: 1 + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + precision: bf16 # must match training precision - 32, 16 or bf16 + eval_batch_size: 4 + prompt_dataset_paths: ${data_dir}/prompt_data/v1.1/squad_val.jsonl + disable_special_tokens: False # Whether to disable virtual tokens in prompt model evaluation. This is equivalent to evaluate without prompt-/p-tuning. 
diff --git a/launcher_scripts/conf/ia3_learning/llama/squad.yaml b/launcher_scripts/conf/ia3_learning/llama/squad.yaml new file mode 100755 index 0000000000..01c22b6f02 --- /dev/null +++ b/launcher_scripts/conf/ia3_learning/llama/squad.yaml @@ -0,0 +1,98 @@ +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: llama_7b + convert_dir: ${base_results_dir}/${ia3_learning.run.model_train_name}/${ia3_learning.run.convert_name} + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/ia3_learning_${.task_name} + +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: 4 + max_steps: -1 + log_every_n_steps: 10 + val_check_interval: 0.1 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + + +exp_manager: + explicit_log_dir: ${ia3_learning.run.results_dir}/results + exp_dir: null + name: megatron_llama_ia3 + create_wandb_logger: True + wandb_logger_kwargs: + project: nemo_llama_ia3 + name: ${ia3_learning.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 5 + mode: min + save_nemo_on_train_end: False + filename: "megatron_gpt_ia3_learn--{val_loss:.3f}-{step}" + model_parallel_size: ${ia3_learning.model.model_parallel_size} + save_best_model: True + +model: + seed: 1234 + nemo_path: ${ia3_learning.run.results_dir}/results/megatron_llama_ia3.nemo # .nemo filename/absolute path to where the virtual prompt model parameters will be saved + virtual_prompt_style: 'no-prompts' # ia3 tuning requires no virtual prompts + encoder_seq_length: 2048 + gradient_as_bucket_view: false + tensor_model_parallel_size: 2 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + global_batch_size: 64 + micro_batch_size: 8 + + restore_path: null # Path to an existing ia3 .nemo model you wish to add new tasks to or run inference with + language_model_path: ${ia3_learning.run.convert_dir}/results/megatron_llama.nemo # Path to the GPT language model .nemo file, always required + existing_tasks: [] # List of tasks the model has already been p-tuned/prompt-tuned for, needed when a restore path is given + new_tasks: ["squad"] # List of new tasknames to be prompt-tuned + + task_templates: # Add more/replace tasks as needed, these are just examples + - taskname: "squad" + prompt_template: "context: {context} question: {question} answer: {answer}" + total_virtual_tokens: 0 + virtual_token_splits: [] + truncate_field: null + answer_only_loss: True + answer_field: "answer" + + data: + train_ds: + - ${data_dir}/prompt_data/v1.1/squad_train.jsonl # multiple prompt dataset can be given at the same time + validation_ds: + - ${data_dir}/prompt_data/v1.1/squad_val.jsonl + add_eos: True + shuffle: True + num_workers: 4 + pin_memory: True + + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning + 
monitor: val_loss + reduce_on_plateau: false diff --git a/launcher_scripts/conf/prompt_learning/llama/squad.yaml b/launcher_scripts/conf/prompt_learning/llama/squad.yaml new file mode 100755 index 0000000000..51104ba17d --- /dev/null +++ b/launcher_scripts/conf/prompt_learning/llama/squad.yaml @@ -0,0 +1,111 @@ +run: + name: ${.task_name}_${.model_train_name} + time_limit: "01:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: llama_7b + convert_dir: ${base_results_dir}/${prompt_learning.run.model_train_name}/${prompt_learning.run.convert_name} + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/prompt_learning_${.task_name} + +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: 4 + max_steps: -1 + log_every_n_steps: 10 + val_check_interval: 200 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + +exp_manager: + explicit_log_dir: ${prompt_learning.run.results_dir}/results + exp_dir: null + name: megatron_llama_prompt + create_wandb_logger: True + wandb_logger_kwargs: + project: nemo_llama_prompt + name: ${prompt_learning.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 5 + mode: min + save_nemo_on_train_end: False + filename: "megatron_llama_prompt_learn--{val_loss:.3f}-{step}" + model_parallel_size: ${prompt_learning.model.model_parallel_size} + save_best_model: True + +model: + seed: 1234 + nemo_path: ${prompt_learning.run.results_dir}/results/megatron_gpt_prompt.nemo # the place to save prompt learning nemo checkpoint + virtual_prompt_style: 'p-tuning' # One of 'p-tuning', 'prompt-tuning', or 'inference'. We recommend 'p-tuning' over 'prompt-tuning'. + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + encoder_seq_length: 2048 + global_batch_size: 64 + micro_batch_size: 8 + + restore_path: null # used to restore from a prompt tuned checkpoint and add new tasks + language_model_path: ${prompt_learning.run.convert_dir}/results/megatron_llama.nemo # Restore lanugage model from pre-trained .nemo checkpoint + existing_tasks: [] # if restore from a prompt tuned checkpoint and add new tasks, existing task names should be included here. + new_tasks: ["squad"] # multiple tasks can be tuned at the same time + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + task_templates: # task_templates for all existing_tasks and new_tasks are required. + - taskname: "squad" # The task name + prompt_template: "<|VIRTUAL_PROMPT_0|>Context: {context} Question: {question} Answer: {answer}" # Prompt template for task, specify virtual prompt positions with <|VIRTUAL_PROMPT_#|> + total_virtual_tokens: 10 # Sum of tokens in virtual_token_splits must add to this number. Can differ between new and existing tasks, but must match across all new tasks being tuned at the same time. 
+ virtual_token_splits: [10] # number of virtual tokens to be inserted at each VIRTUAL PROMPT location, must add to total_virtual_tokens + truncate_field: "context" # The {field} in the prompt template whose text will be truncated if the input is too long, if null, inputs that are too long will just be skipped. + answer_field: "answer" # Answer/Target field + answer_only_loss: True # If true, the loss will only be calculated with answer_field text vs. ground truth. If false, the loss will be calculated over entire sentence. + + prompt_learning: # Prompt tunin specific params + new_prompt_init_methods: null # e.g ['text'], List of 'text' or 'random', should correspond to tasks listed in new tasks + new_prompt_init_text: null # e.g ['some init text goes here'], some init text if init method is text, or None if init method is random + + p_tuning: # P-tuning specific params + encoder_type: "tpmlp" # ['tpmlp', 'lstm', 'biglstm', 'mlp'] + dropout: 0.0 + num_layers: 2 # number of layers for MLP or LSTM layers. Note, it has no effect for tpmlp currently as it always assumes it is two layers. + encoder_hidden: 2048 # encoder hidden for biglstm and tpmlp + init_std: 0.023 # init std for tpmlp layers + + data: + train_ds: + - ${data_dir}/prompt_data/v1.1/squad_train.jsonl # multiple prompt dataset can be given at the same time + validation_ds: + - ${data_dir}/prompt_data/v1.1/squad_val.jsonl + add_eos: True + shuffle: True + num_workers: 4 + pin_memory: True + + optim: + name: fused_adam + lr: 2.0e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + constant_steps: 10 + min_lr: 0.0 # has to be zero + monitor: val_loss + reduce_on_plateau: false diff --git a/launcher_scripts/conf/training/llama/7b.yaml b/launcher_scripts/conf/training/llama/7b.yaml index fcc4aa58da..96cb4790c0 100755 --- a/launcher_scripts/conf/training/llama/7b.yaml +++ b/launcher_scripts/conf/training/llama/7b.yaml @@ -1,7 +1,7 @@ run: name: llama_7b results_dir: ${base_results_dir}/${.name} - time_limit: "0-02:00:00" + time_limit: "0-04:00:00" dependency: "singleton" trainer: num_nodes: 4 diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py index 25d5a016ca..986ec8357d 100755 --- a/launcher_scripts/main.py +++ b/launcher_scripts/main.py @@ -43,7 +43,7 @@ "conversion": Conversion, "export": Export, "evaluation": { - EvalHarnessEvaluation: ["gpt3", "prompt_gpt3"], + EvalHarnessEvaluation: ["gpt3", "prompt_gpt3", "llama", "prompt_llama"], NeMoEvaluation: ["t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", "adapter_gpt3", "ia3_t5", "ia3_gpt3"], }, "data_preparation": { diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/dolly_dataprep/download.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/dolly_dataprep/download.py index de679a6fb9..0ce8d12382 100644 --- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/dolly_dataprep/download.py +++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/dolly_dataprep/download.py @@ -23,7 +23,7 @@ import os from argparse import ArgumentParser -default_link = "https://github.com/databrickslabs/dolly/raw/master/data/databricks-dolly-15k.jsonl" +default_link = "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl" def get_file_name(link): file_name = link.split('/')[-1] diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/evaluate.py b/launcher_scripts/nemo_launcher/collections/eval_harness/evaluate.py index 
f6a664ef8e..df6c20f27c 100755 --- a/launcher_scripts/nemo_launcher/collections/eval_harness/evaluate.py +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/evaluate.py @@ -85,7 +85,7 @@ def parse_args(parser_main): parser.add_argument("--model", required=True) parser.add_argument( - "--nemo_model", type=str, default=None, required=False, help="Pass path to model's .nemo file", + "--nemo_model", default=None, required=False, help="Pass path to model's .nemo file", ) parser.add_argument( "--checkpoint_folder", @@ -120,6 +120,7 @@ def parse_args(parser_main): parser.add_argument("--vocab_file", default=None) parser.add_argument("--merge_file", default=None) + parser.add_argument("--tokenizer_model", default=None) parser.add_argument( "--prompt_dataset_paths", @@ -292,9 +293,10 @@ def main(): pipeline_model_parallel_size = args.pipeline_model_parallel_size vocab_file = args.vocab_file merge_file = args.merge_file + tokenizer_model = args.tokenizer_model hparams_override_file = None - if args.nemo_model is None: # Not loading from .nemo checkpoint + if args.nemo_model is None or args.nemo_model == "None": # Not loading from .nemo checkpoint # Checkpoint search if checkpoint_name == "latest": checkpoints = os.path.join(checkpoint_folder, "*.ckpt") @@ -322,6 +324,8 @@ def main(): conf.cfg.tokenizer.vocab_file = vocab_file if merge_file is not None: conf.cfg.tokenizer.merge_file = merge_file + if tokenizer_model is not None: + conf.cfg.tokenizer.model = tokenizer_model if "activations_checkpoint_granularity" in conf.cfg: conf.cfg.activations_checkpoint_granularity = None if "activations_checkpoint_method" in conf.cfg: diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py index 8c9a1b5ed2..1b18dc64e5 100755 --- a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/__init__.py @@ -14,11 +14,13 @@ from lm_eval.base import LM -from . import dummy, nemo_gpt3, nemo_gpt3_prompt +from . import dummy, nemo_gpt3, nemo_gpt3_prompt, nemo_llama, nemo_llama_prompt MODEL_REGISTRY = { "nemo-gpt3": nemo_gpt3.NeMo_GPT3LM_TP_PP, + "nemo-llama": nemo_llama.NeMo_LLAMALM_TP_PP, "nemo-gpt3-prompt": nemo_gpt3_prompt.NeMo_GPT3_PROMPTLM, + "nemo-llama-prompt": nemo_llama_prompt.NeMo_LLAMA_PROMPTLM, "dummy": dummy.DummyLM, } diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py new file mode 100755 index 0000000000..462d28f549 --- /dev/null +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py @@ -0,0 +1,218 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +from omegaconf import OmegaConf, open_dict + +import torch +import tqdm +from megatron.core import parallel_state +from lm_eval import utils +from lm_eval.base import LM +from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel +from nemo.collections.nlp.modules.common.text_generation_utils import generate, get_computeprob_response +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy, NLPSaveRestoreConnector +from nemo.utils import logging +from nemo.utils.app_state import AppState +from nemo.utils.get_rank import is_global_rank_zero +from nemo.utils.model_utils import inject_model_parallel_rank +from pytorch_lightning.trainer.trainer import Trainer +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate + +from .nemo_gpt3 import RequestDataset, setup_trainer_and_model, DDP_initialize + +class NeMo_LLAMALM_TP_PP(LM): + def __init__(self, args, truncate=False, batch_size=1): + super().__init__() + + # get nemo megatron + logging.info(f"**** Building LLaMA model ...") + self.trainer, self.model = setup_trainer_and_model(args) + self.tokenizer = self.model.tokenizer + self.model.eval() + + self.max_length = self.model.cfg.get("max_position_embeddings") + assert self.tokenizer.text_to_ids("hello\n\nhello") == [ + 22172, + 13, + 13, + 12199, + ], "Tokenizer text_to_ids is not working as expected." + + self.truncate = truncate + self.batch_size = batch_size + + # initialize DDP and move model to GPU + DDP_initialize(self.model) + self.model = self.model.cuda() + + @classmethod + def create_from_arg_string(cls, arg_string, additional_config={}): + args = utils.simple_parse_args_string(arg_string) + args2 = {k: v for k, v in additional_config.items() if v is not None} + return cls(args, **args2) + + def loglikelihood(self, requests): + return self._loglikelihood(requests) + + """ + request: (context, continuation) + how this all works: + CTX CONT + inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + gpt2 \ \ + logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice + cont_toks 4 5 6 7 8 9 + when too long to fit in context, truncate from the left + """ + + def _loglikelihood(self, requests): + def pad_collate(batch, eos_id=2): + tokens = [item[0] for item in batch] + conti_lens = [item[1] for item in batch] + lens = [len(token) - 1 for token in tokens] # fake delete last token by reducing input len + max_len = max(lens) + extra_pad_len = 0 + if max_len % 8 != 0: + extra_pad_len = 8 - (max_len % 8) + max_len += extra_pad_len + # extra_pad_len = 2048 - max_len + # max_len += extra_pad_len + + tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=eos_id) + if extra_pad_len > 0: + extra_pad = torch.ones(extra_pad_len, len(batch)) * eos_id + extra_pad = extra_pad.type_as(tokens_pad) + tokens_pad = torch.vstack((tokens_pad, extra_pad)) + # Add padding to all samples to adapt nemo generate api + + new_batch = [] + for token, lenn, conti_len in zip(tokens_pad.T, lens, conti_lens): + # (token, lenn, tokens_to_generate, compute_logprobs) + new_batch.append((token, max_len, lenn, conti_len)) + + new_batch = default_collate(new_batch) + return new_batch + + def _collate(x): # used to reorder request and remove duplications + """ + the negative sign on len(toks) 
sorts descending - this has a few advantages: + - time estimates will always be over not underestimates, which is more useful for planning + - to know the size of a batch when going through the list, you know the first one is always the batch padded context length. + this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement + - any OOMs will happen right away rather than near the end + """ + toks = x[0] + x[1] + return -len(toks), tuple(toks) + + reord = utils.Reorderer(requests, _collate) + request_ds = RequestDataset(reord.get_reordered(), self.model.tokenizer) + request_dl = DataLoader(request_ds, collate_fn=pad_collate, batch_size=self.batch_size, shuffle=False) + + def logits_to_results(batch, response): + input_token_ids_batch, _, lens, conti_lens = batch + batch_size = len(lens) + assert len(response['token_ids']) == batch_size, "Response's length not equal to batch size." + + batch_res = [] + for index in range(batch_size): + inp_len = lens[index] + conti_len = conti_lens[index] + + inp_token_ids = input_token_ids_batch[index].tolist()[: inp_len + 1] # recover fake deleted token + response_token_ids = response['token_ids'][index][:inp_len] + + assert response_token_ids == inp_token_ids[:-1], f"Mismatch in input tokens." + + log_probs = response['full_logprob'][index][:inp_len] # torch.tensor + log_probs = log_probs[-conti_len:] + + greedy_tokens = log_probs.argmax(dim=-1) + greedy_tokens = self.tokenizer.ids_to_tokens(greedy_tokens.cpu().numpy().tolist()) + + conti_token_ids = inp_token_ids[-conti_len:] + conti_tokens = self.tokenizer.ids_to_tokens(conti_token_ids) + + max_equal = greedy_tokens == conti_tokens + log_probs = log_probs.cpu().to(torch.float32) + conti_enc = torch.tensor(self.tokenizer.tokens_to_ids(conti_tokens)) + conti_probs = torch.gather(log_probs, 1, conti_enc.unsqueeze(-1)).squeeze(-1) + + batch_res.append((float(conti_probs.sum()), bool(max_equal), greedy_tokens, conti_tokens)) + return batch_res + + res = [] + for batch in tqdm.tqdm(request_dl): + # inputs = (token_ids, conti_lens) + inputs = (batch[0].cuda(), batch[1].cuda()) + response = generate( + model=self.model, + inputs=inputs, + tokens_to_generate=1, + all_probs=True, + temperature=1.0, + add_BOS=False, + top_k=0, + top_p=0.9, + greedy=True, + repetition_penalty=1.0, + min_tokens_to_generate=0, + ) + response = get_computeprob_response(self.tokenizer, response, inputs) + + if is_global_rank_zero(): + res.extend(logits_to_results(batch, response)) + + del inputs, response + + return reord.get_original(res) if self.can_access_output() else None + + def loglikelihood_rolling(self, requests): + loglikelihoods = [] + len_rolling_token_windows = [0] + all_rolling_token_windows = [] + + for (string,) in requests: + rolling_token_windows = list( + map( + utils.make_disjoint_window, + utils.get_rolling_token_windows( + token_list=self.tokenizer.text_to_ids(string), + prefix_token=2, + max_seq_len=self.max_length, + context_len=1, + ), + ) + ) + + len_rolling_token_windows.append(len(rolling_token_windows) + len_rolling_token_windows[-1]) + all_rolling_token_windows.extend(rolling_token_windows) + + string_nll = self._loglikelihood(all_rolling_token_windows) + if self.can_access_output(): + string_nll = [x[0] for x in string_nll] + # discard is_greedy + for i in range(len(len_rolling_token_windows) - 1): + loglikelihoods.append(sum(string_nll[len_rolling_token_windows[i] : len_rolling_token_windows[i + 1]])) + + return loglikelihoods + + 
def greedy_until(self, requests): + raise NotImplementedError + + def can_access_output(self): + return is_global_rank_zero() diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama_prompt.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama_prompt.py new file mode 100755 index 0000000000..59be96c5e7 --- /dev/null +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama_prompt.py @@ -0,0 +1,174 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import tqdm +from lm_eval import utils +from lm_eval.base import LM +from nemo.collections.nlp.models.language_modeling.megatron_gpt_prompt_learning_model import ( + MegatronGPTPromptLearningModel, +) +from nemo.collections.nlp.modules.common.megatron.megatron_init import fake_initialize_model_parallel +from nemo.collections.nlp.modules.common.text_generation_utils import generate, get_computeprob_response +from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy +from nemo.utils import logging +from nemo.utils.app_state import AppState +from nemo.utils.get_rank import is_global_rank_zero +from pytorch_lightning.trainer.trainer import Trainer +from torch.nn.utils.rnn import pad_sequence +from torch.utils.data import DataLoader, Dataset +from torch.utils.data.dataloader import default_collate + +from .nemo_gpt3_prompt import PromptRequestDataset, setup_trainer_and_model, DDP_initialize + +class NeMo_LLAMA_PROMPTLM(LM): + def __init__(self, args, truncate=False, batch_size=1): + super().__init__() + + # get nemo megatron + logging.info(f"**** Building LLaMA Prompt model ...") + self.trainer, self.model = setup_trainer_and_model(args) + self.tokenizer = self.model.tokenizer + self.model.eval() + + self.max_length = self.model.cfg.get("max_position_embeddings") + assert self.tokenizer.text_to_ids("hello\n\nhello") == [ + 22172, + 13, + 13, + 12199, + ], "Tokenizer text_to_ids is not working as expected." 
+ + self.truncate = truncate + self.batch_size = batch_size + + # initialize DDP and move model to GPU + DDP_initialize(self.model) + self.model = self.model.cuda() + + @classmethod + def create_from_arg_string(cls, arg_string, additional_config={}): + args = utils.simple_parse_args_string(arg_string) + args2 = {k: v for k, v in additional_config.items() if v is not None} + return cls(args, **args2) + + def loglikelihood(self, requests): + return self._loglikelihood(requests) + + """ + request: (context, continuation) + how this all works: + CTX CONT + inp 0 1 2 3|4 5 6 7 8 9 <- last token is deleted by inp[:, :-1] + gpt2 \ \ + logits 1 2 3|4 5 6 7 8 9 <- the ctx half gets tossed out by the [:, -len(continuation_enc):, :self.VOCAB_SIZE] slice + cont_toks 4 5 6 7 8 9 + when too long to fit in context, truncate from the left + """ + + def _loglikelihood(self, requests): + def pad_collate(batch, eos_id=2): + tokens, conti_lens, task_ids, *_ = map(list, zip(*batch)) + lens = [len(token) - 1 for token in tokens] # fake delete last token by reducing input len + max_len = max(lens) + + tokens_pad = pad_sequence(tokens, batch_first=False, padding_value=eos_id) + # Add padding to all samples to adapt nemo generate api + # tokens_pad = torch.cat((tokens_pad, torch.ones((1, len(tokens)), dtype=torch.int) * eos_id), 0) + + new_batch = [] + for token, lenn, conti_len, task_id in zip(tokens_pad.T, lens, conti_lens, task_ids): + new_batch.append((token, max_len, task_id, lenn, conti_len)) + + new_batch = default_collate(new_batch) + return new_batch + + def _collate(x): # used to reorder request and remove duplications + """ + the negative sign on len(toks) sorts descending - this has a few advantages: + - time estimates will always be over not underestimates, which is more useful for planning + - to know the size of a batch when going through the list, you know the first one is always the batch padded context length. + this is useful to simplify the batching logic and more importantly to make automatic adaptive batches much much easier to implement + - any OOMs will happen right away rather than near the end + """ + toks = x[0] + x[1] + return -len(toks), tuple(toks) + + reord = utils.Reorderer(requests, _collate) + request_ds = PromptRequestDataset(reord.get_reordered(), self.model.tokenizer) + request_dl = DataLoader(request_ds, collate_fn=pad_collate, batch_size=self.batch_size, shuffle=False) + + def logits_to_results(batch, response): + input_token_ids_batch, _, _, lens, conti_lens = batch + batch_size = len(lens) + assert len(response["token_ids"]) == batch_size, "Response's length not equal to batch size." 
+ + batch_res = [] + for index in range(batch_size): + inp_len = lens[index] + conti_len = conti_lens[index] + + inp_token_ids = input_token_ids_batch[index].tolist()[: inp_len + 1] # recover fake deleted token + + log_probs = response["full_logprob"][index][:inp_len] # torch.tensor + log_probs = log_probs[-conti_len:] + + greedy_tokens = log_probs.argmax(dim=-1) + greedy_tokens = self.tokenizer.ids_to_tokens(greedy_tokens.cpu().numpy().tolist()) + + conti_token_ids = inp_token_ids[-conti_len:] + conti_tokens = self.tokenizer.ids_to_tokens(conti_token_ids) + + max_equal = greedy_tokens == conti_tokens + log_probs = log_probs.cpu().to(torch.float32) + conti_enc = torch.tensor(self.tokenizer.tokens_to_ids(conti_tokens)) + conti_probs = torch.gather(log_probs, 1, conti_enc.unsqueeze(-1)).squeeze(-1) + + batch_res.append((float(conti_probs.sum()), bool(max_equal), greedy_tokens, conti_tokens)) + return batch_res + + res = [] + for batch in tqdm.tqdm(request_dl): + # inputs = (token_ids, conti_lens) + inputs = (batch[0].cuda(), batch[1].cuda()) + task_ids = torch.zeros((self.batch_size, 1), device='cuda') + response = generate( + model=self.model, + inputs=inputs, + task_ids=task_ids, + tokens_to_generate=1, + all_probs=True, + temperature=1.0, + add_BOS=False, + top_k=0, + top_p=0.9, + greedy=True, + repetition_penalty=1.0, + min_tokens_to_generate=0, + ) + response = get_computeprob_response(self.tokenizer, response, inputs) + + if is_global_rank_zero(): + res.extend(logits_to_results(batch, response)) + + return reord.get_original(res) if self.can_access_output() else None + + def loglikelihood_rolling(self, requests): + raise NotImplementedError + + def greedy_until(self, requests): + raise NotImplementedError + + def can_access_output(self): + return is_global_rank_zero() diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index b0319ae7ff..3277996eb1 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -213,8 +213,9 @@ def add_container_mounts(container_mounts): cfg = self.cfg data_dir = cfg.get("data_dir") + nemo_dir = cfg.get("nemo_dir") base_results_dir = cfg.get("base_results_dir") - mounts_string = f"{self._launcher_scripts_path}:{self._launcher_scripts_path},{data_dir}:{data_dir},{base_results_dir}:{base_results_dir}" + mounts_string = f"{self._launcher_scripts_path}:{self._launcher_scripts_path},{data_dir}:{data_dir},{base_results_dir}:{base_results_dir},{nemo_dir}:{nemo_dir}" container_mounts = cfg.get("container_mounts") mounts_string += add_container_mounts(container_mounts) @@ -382,7 +383,7 @@ def _launcher_scripts_path(self) -> Path: @property def _nemo_code_path(self) -> Path: - return Path("/opt/NeMo") + return Path(self.cfg.get("nemo_dir", "/opt/NeMo")) @property def _data_dir(self) -> Path: @@ -976,7 +977,8 @@ class EvalHarnessEvaluation(NemoMegatronStage): def __init__(self, cfg): super().__init__(cfg) choice_model_type, choice_name = self.get_stage_config_choice() - self.prompt_evaluation = choice_model_type == "prompt_gpt3" + #self.prompt_evaluation = choice_model_type == "prompt_gpt3" + self.prompt_evaluation = True if "prompt" in choice_model_type else False def setup_stage_vars(self, cfg): """Setup the stage vars, i.e. 
stage name and stage cfg""" @@ -1053,6 +1055,7 @@ def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]: nemo_model=model_cfg.get("nemo_model"), checkpoint_folder=model_cfg.get("checkpoint_folder"), checkpoint_name=model_cfg.get("checkpoint_name"), + tokenizer_model=model_cfg.get("tokenizer_model"), hparams_file=model_cfg.get("hparams_file"), ) From e0a2d64deaccd782d11d39f30207aaa6243cadcc Mon Sep 17 00:00:00 2001 From: Joseph Jennings Date: Thu, 27 Jul 2023 12:32:28 -0700 Subject: [PATCH 05/62] First commit of quality filtering stage --- launcher_scripts/main.py | 19 +- .../core/data_curation_stages.py | 162 ++++++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 42 ++--- 3 files changed, 198 insertions(+), 25 deletions(-) create mode 100644 launcher_scripts/nemo_launcher/core/data_curation_stages.py diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py index 4053328f2c..61bec11eb6 100755 --- a/launcher_scripts/main.py +++ b/launcher_scripts/main.py @@ -18,6 +18,7 @@ import hydra import omegaconf from nemo_launcher.core.data_stages import CustomDataPreparation, MC4DataPreparation, PileDataPreparation +from nemo_launcher.core.data_curation_stages import QualityFiltering from nemo_launcher.core.export_stages import Export from nemo_launcher.core.stages import ( AdapterLearning, @@ -30,9 +31,15 @@ Training, ) -omegaconf.OmegaConf.register_new_resolver("multiply", lambda x, y: x * y, replace=True) -omegaconf.OmegaConf.register_new_resolver("divide_ceil", lambda x, y: int(math.ceil(x / y)), replace=True) -omegaconf.OmegaConf.register_new_resolver("divide_floor", lambda x, y: int(math.floor(x / y)), replace=True) +omegaconf.OmegaConf.register_new_resolver("multiply", + lambda x, y: x * y, + replace=True) +omegaconf.OmegaConf.register_new_resolver("divide_ceil", + lambda x, y: int(math.ceil(x / y)), + replace=True) +omegaconf.OmegaConf.register_new_resolver("divide_floor", + lambda x, y: int(math.floor(x / y)), + replace=True) STR2STAGECLASS = { "training": Training, @@ -44,13 +51,17 @@ "export": Export, "evaluation": { EvalHarnessEvaluation: ["gpt3", "prompt_gpt3"], - NeMoEvaluation: ["t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", "adapter_gpt3", "ia3_t5", "ia3_gpt3"], + NeMoEvaluation: [ + "t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", + "adapter_gpt3", "ia3_t5", "ia3_gpt3" + ], }, "data_preparation": { PileDataPreparation: ["gpt3", "t5", "bert"], MC4DataPreparation: ["mt5"], CustomDataPreparation: ["generic"], }, + "quality_filtering": QualityFiltering, } diff --git a/launcher_scripts/nemo_launcher/core/data_curation_stages.py b/launcher_scripts/nemo_launcher/core/data_curation_stages.py new file mode 100644 index 0000000000..c743f16569 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/data_curation_stages.py @@ -0,0 +1,162 @@ +import copy +import shlex +import omegaconf +from typing import Dict, List +from pathlib import Path + +from nemo_launcher.core.stages import ( + NemoMegatronStage, + create_args_list, + clean_command_groups, +) +from nemo_launcher.core.launchers import AutoLauncher + + +class DataCurationStage(NemoMegatronStage): + def __init__(self, cfg): + super().__init__(cfg) + self.log_folder = Path() + self.conf_folder = Path() + + def setup_folder_and_data(self): + job_path = self.get_job_path() + job_path.folder.mkdir(parents=True, exist_ok=True) + # make the results dir + results_folder = job_path.results_folder + results_folder.mkdir(parents=True, exist_ok=True) + # make the log dir + self.log_folder = 
Path(job_path.folder, 'log') + self.log_folder.mkdir(parents=True, exist_ok=True) + # Make the conf dir + self.conf_folder = Path(job_path.folder, 'config') + self.conf_folder.mkdir(parents=True, exist_ok=True) + + def _make_cluster_parameters( + self, + cluster: str, + ) -> Dict: + """ + Make a cluster-specific parameters for jobs on different clusters. + Current clusters include bcm(slurm), bcp and interactive. + For example for bcm, it will return slurm parameters: + {'job_name': 'some_name', 'nodes': 2, 'ntasks_per_node': 8, ...} + + :param str cluster: i.e. `bcm`, `bcp`, `interactive`, etc. + :param Optional sub_stage: current sub_stage name + :return: a dictionary of cluster parameters, e.g. `ntasks_per_node` + :rtype: Dict + """ + cfg = self.cfg + stage_cfg = self.stage_cfg + + run_cfg = stage_cfg.get("run") + job_name = run_cfg.get("name") + time_limit = run_cfg.get("time_limit") + nodes = run_cfg.get('nodes') + # Allow for updating the partition as we might run + # on CPU only nodes + partition = run_cfg.get('partition') + + container_image = cfg.get("container") + container_mounts = self._make_container_mounts_string() + + shared_parameters = { + "job_name": job_name, + "time": time_limit, + } + if cluster == "bcm": + cluster_cfg = cfg.get("cluster") + slurm_cfg = {**copy.deepcopy(cluster_cfg)} + job_name_prefix = slurm_cfg.pop("job_name_prefix") + cluster_params = { + **slurm_cfg, + } + cluster_params.update({ + **shared_parameters, + "container_image": container_image, + "container_mounts": container_mounts, + }) + cluster_params[ + "job_name"] = job_name_prefix + cluster_params["job_name"] + cluster_params['nodes'] = nodes + cluster_params['partition'] = partition + + return cluster_params + + def run(self): + self.setup_folder_and_data() + job_path = self.get_job_path() + + cluster_parameters = self._make_cluster_parameters(self.cluster) + stage_cfg_path = NemoMegatronStage.save_stage_hydra_config( + self.stage_cfg, + job_path, + ) + + command_groups = self.make_stage_command_groups(stage_cfg_path) + + launcher = AutoLauncher( + folder=self.get_job_path().folder, + cluster=self.cluster, + **cluster_parameters, + ) + + job_id = launcher.launch(command_groups) + + return job_id + + +class QualityFiltering(DataCurationStage): + def __init__(self, cfg): + super().__init__(cfg) + + def setup_stage_vars(self, cfg): + self.stage_name = "quality_filtering" + self.stage_cfg = cfg.get("quality_filtering") + + def make_stage_command_groups( + self, + stage_cfg_path: Path, + ) -> List[List[str]]: + + stage_cfg = self.stage_cfg + job_path = self.get_job_path() + + # Write out the filter configuration as a separate config file + filter_cfg = Path(self.conf_folder, "heuristic_filter.yaml") + omegaconf.OmegaConf.save(stage_cfg.get('filter'), filter_cfg) + + command_groups = [[]] + + optional_args = { + "output_removed_document_dir": + stage_cfg.get('output_removed_document_dir'), + "output_document_score_dir": + stage_cfg.get('output_document_score_dir'), + } + + # Remove any arguments that are not specified + optional_args = { + arg: optional_args[arg] + for arg in optional_args if optional_args[arg] + } + + args = create_args_list( + replace_underscore=False, + log_dir=self.log_folder, + res_dir=job_path.results_folder, + conf_dir=self.conf_folder, + input_dir=stage_cfg.get("input_dir"), + filter_config_file=f"{self.conf_folder}/{filter_cfg}", + output_retained_document_dir=stage_cfg.get( + "output_retained_document_dir"), + **optional_args, + ) + + core_command = ["filter_documents", 
*args] + + core_command_string = " \\\n ".join(core_command) + command_groups[-1] += [core_command_string] + command_groups = clean_command_groups(command_groups) + + return command_groups diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 4acd81daa7..6aeafda39d 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -240,7 +240,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: dependency = run_cfg.get("dependency") if nodes is None: nodes = stage_cfg.get("trainer").get("num_nodes") - + ntasks_per_node = run_cfg.get("ntasks_per_node") if ntasks_per_node is None: ntasks_per_node = stage_cfg.get("trainer").get("devices") @@ -287,7 +287,7 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: cluster_parameters.update(shared_parameters) return cluster_parameters - + def _find_optimal_nodes(self, cfg, gpus) -> None: nodes_scheduler_path = f"{cfg.get('training').get('run').get('results_dir')}/nodes_scheduler.json" @@ -320,7 +320,7 @@ def _find_optimal_nodes(self, cfg, gpus) -> None: optimal_lst.append(nodes) self.nodes_scheduler[str(b)] = max(optimal_lst) - + sched_rbs = [int(i) for i in self.nodes_scheduler.keys()] assert rbs[::-1] == sched_rbs, ( "please, make sure you enter the correct combination of" @@ -329,7 +329,7 @@ def _find_optimal_nodes(self, cfg, gpus) -> None: with open(nodes_scheduler_path, 'w') as nodes_scheduler: nodes_scheduler.write(json.dumps(self.nodes_scheduler)) - + def _get_current_gbs(self, cfg): start_bs = cfg.get('training').get('model').get('rampup_batch_size')[0] results_dir = cfg.get('training').get('run').get('results_dir') @@ -340,16 +340,16 @@ def _get_current_gbs(self, cfg): for file in glob.glob("*.out"): file = file.split('_')[-1].split('.')[0] job_numbers.append(int(file)) - + job_number = max(job_numbers) last_job = glob.glob(f"*{job_number}.out")[0] with open(last_job, 'r') as logs: logs = logs.read() - + current_gbs = re.findall(r'global_batch_size=(\d+)', logs)[-1] except: current_gbs = start_bs - + return current_gbs def get_env_vars(self) -> Dict: @@ -528,11 +528,11 @@ def _make_nemo_call_string(self, stage_cfg_path: Path) -> str: def _make_hydra_override(self) -> List: """ Override some existing hydra configurations if necessary. - + Example use cases are: 1. For bcp cluster, `+rank=\${RANK}` is required running some NeMo scripts. Existing hydra config doesn't have `rank` field, so we overwrite on the fly. - 2. Auto blend training dataset by overwriting empty `model.data.data_prefix` as + 2. Auto blend training dataset by overwriting empty `model.data.data_prefix` as `model.data.data_prefix=\$({auto_blend_command})`. Existing `model.data.data_prefix` could be None in cfg, so we overwrite it in this function. """ @@ -578,7 +578,7 @@ def _make_hydra_override(self) -> List: Example use cases are: 1. For bcp cluster, `+rank=\${RANK}` is required running some NeMo scripts. Existing hydra config doesn't have `rank` field, so we overwrite on the fly. - 2. Auto blend training dataset by overwriting empty `model.data.data_prefix` as + 2. Auto blend training dataset by overwriting empty `model.data.data_prefix` as `model.data.data_prefix=\$({auto_blend_command})`. Existing `model.data.data_prefix` could be None in cfg, so we overwrite it in this function. @@ -613,7 +613,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. 
:param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. - :return: path current stage's essential nemo scripts code + :return: path current stage's essential nemo scripts code :rtype: Path """ model_type_to_code_path = { @@ -678,9 +678,9 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts. For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. - + :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. - :return: path current stage's essential nemo scripts code + :return: path current stage's essential nemo scripts code :rtype: Path """ if model_type == "gpt3": @@ -718,9 +718,9 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts. For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. - + :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. - :return: path current stage's essential nemo scripts code + :return: path current stage's essential nemo scripts code :rtype: Path """ model_type_to_code_path = { @@ -741,9 +741,9 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts. For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. - + :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. - :return: path current stage's essential nemo scripts code + :return: path current stage's essential nemo scripts code :rtype: Path """ model_type_to_code_path = { @@ -763,9 +763,9 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts. For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. - + :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. - :return: path current stage's essential nemo scripts code + :return: path current stage's essential nemo scripts code :rtype: Path """ model_type_to_code_path = { @@ -945,9 +945,9 @@ def _get_nemo_code_path(self, model_type: str) -> Path: """ Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts. For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. - + :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. 
- :return: path current stage's essential nemo scripts code + :return: path current stage's essential nemo scripts code :rtype: Path """ if model_type in ["gpt3", "prompt_gpt3"]: From ba9539e696999fa3deb90c355a7c8c406e9d070b Mon Sep 17 00:00:00 2001 From: Joseph Jennings Date: Thu, 27 Jul 2023 12:53:09 -0700 Subject: [PATCH 06/62] Add config for sub stage --- .../quality_filtering/heuristic/english.yaml | 126 ++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 launcher_scripts/conf/quality_filtering/heuristic/english.yaml diff --git a/launcher_scripts/conf/quality_filtering/heuristic/english.yaml b/launcher_scripts/conf/quality_filtering/heuristic/english.yaml new file mode 100644 index 0000000000..9e7d6fedb0 --- /dev/null +++ b/launcher_scripts/conf/quality_filtering/heuristic/english.yaml @@ -0,0 +1,126 @@ +run: + name: 'heuristic-filter-en' + results_dir: ${base_results_dir}/${.name} + time_limit: "08:00:00" + dependency: "singleton" + nodes: 1 + partition: + cpus_per_node: 48 + +# Provide the downloader, data loader and extraction modules that +# define how the dataset will be built from the URLs +filter: + filter_module: ndc.filter.heuristics.filter.CascadedHeuristicFilter + params: + # The filters below define a chain of heuristic filters to be applied to each document in a corpus. + # This particular cascade of filters is intended to filter English language data. + # The filter listed at the top will be applied first, and the following filters will be applied in + # the order they appear in this file. Each filter can be removed and re-ordered as desired. + # New filters can be added as described in docs/1_document_filtering.rst + filters: + - name: ndc.filter.heuristics.filter.NonAlphaNumericFilter + params: + max_non_alpha_numeric_to_text_ratio: 0.25 + - name: ndc.filter.heuristics.filter.SymbolsToWordsFilter + params: + max_symbol_to_word_ratio: 0.1 + - name: ndc.filter.heuristics.filter.NumbersFilter + params: + max_number_to_text_ratio: 0.15 + - name: ndc.filter.heuristics.filter.UrlsFilter + params: + max_url_to_text_ratio: 0.2 + - name: ndc.filter.heuristics.filter.WhiteSpaceFilter + params: + max_white_space_ratio: 0.25 + - name: ndc.filter.heuristics.filter.ParenthesesFilter + params: + max_parentheses_ratio: 0.1 + - name: ndc.filter.heuristics.filter.BoilerPlateStringFilter + params: + remove_if_at_top_or_bottom: True + max_boilerplate_string_ratio: 0.4 + - name: ndc.filter.heuristics.filter.RepeatedLinesFilter + params: + max_repeated_line_fraction: 0.7 + - name: ndc.filter.heuristics.filter.RepeatedParagraphsFilter + params: + max_repeated_paragraphs_ratio: 0.7 + - name: ndc.filter.heuristics.filter.RepeatedLinesByCharFilter + params: + max_repeated_lines_char_ratio: 0.8 + - name: ndc.filter.heuristics.filter.RepeatedParagraphsByCharFilter + params: + max_repeated_paragraphs_char_ratio: 0.8 + - name: ndc.filter.heuristics.filter.WordCountFilter + params: + min_words: 50 + max_words: 100000 + - name: ndc.filter.heuristics.filter.PunctuationFilter + params: + max_num_sentences_without_endmark_ratio: 0.85 + - name: ndc.filter.heuristics.filter.WordsWithoutAlphabetsFilter + params: + max_words_without_alphabets: 0.8 + - name: ndc.filter.heuristics.filter.CommonEnglishWordsFilter + params: + min_num_common_words: 2 + stop_at_false: True + - name: ndc.filter.heuristics.filter.MeanWordLengthFilter + params: + max_mean_word_length: 10 + min_mean_word_length: 3 + - name: ndc.filter.heuristics.filter.LongWordFilter + params: + max_word_length: 1000 + - name: 
ndc.filter.heuristics.filter.EllipsisFilter + params: + max_num_lines_ending_with_ellipsis_ratio: 0.3 + # Top N-Gram filters for N-grams 2, 3, and 4 + - name: ndc.filter.heuristics.filter.RepeatingTopNGramsFilter + params: + n: 2 + max_repeating_ngram_ratio: 0.2 + - name: ndc.filter.heuristics.filter.RepeatingTopNGramsFilter + params: + n: 3 + max_repeating_ngram_ratio: 0.18 + - name: ndc.filter.heuristics.filter.RepeatingTopNGramsFilter + params: + n: 4 + max_repeating_ngram_ratio: 0.16 + # Duplicate N-gram filters for N-grams 5, 6, 7, 8, 9, and 10 + - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter + params: + n: 5 + max_repeating_duplicate_ngram_ratio: 0.15 + - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter + params: + n: 6 + max_repeating_duplicate_ngram_ratio: 0.14 + - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter + params: + n: 7 + max_repeating_duplicate_ngram_ratio: 0.13 + - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter + params: + n: 8 + max_repeating_duplicate_ngram_ratio: 0.12 + - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter + params: + n: 9 + max_repeating_duplicate_ngram_ratio: 0.11 + - name: ndc.filter.heuristics.filter.RepeatingDuplicateNGramsFilter + params: + n: 10 + max_repeating_duplicate_ngram_ratio: 0.10 + - name: ndc.filter.heuristics.filter.BulletsFilter + params: + max_bullet_lines_ratio: 0.9 + # If True, the chained operation defined by the filters above + # will stop at first filter that is triggered during the above defined pipeline + stop_at_true: True + +input_dir: ${data_dir}/json/original +# Output directory to where filtered documents will be written +output_retained_document_dir: ${data_dir}/json/filtered/high_quality From fe11fab098f2c303548e436152c9ad8be5bc64c0 Mon Sep 17 00:00:00 2001 From: Joseph Jennings Date: Thu, 27 Jul 2023 18:27:36 -0700 Subject: [PATCH 07/62] Fix some path errors --- .../nemo_launcher/core/data_curation_stages.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/data_curation_stages.py b/launcher_scripts/nemo_launcher/core/data_curation_stages.py index c743f16569..f4872c562b 100644 --- a/launcher_scripts/nemo_launcher/core/data_curation_stages.py +++ b/launcher_scripts/nemo_launcher/core/data_curation_stages.py @@ -120,7 +120,6 @@ def make_stage_command_groups( ) -> List[List[str]]: stage_cfg = self.stage_cfg - job_path = self.get_job_path() # Write out the filter configuration as a separate config file filter_cfg = Path(self.conf_folder, "heuristic_filter.yaml") @@ -142,12 +141,10 @@ def make_stage_command_groups( } args = create_args_list( - replace_underscore=False, + replace_underscore=True, log_dir=self.log_folder, - res_dir=job_path.results_folder, - conf_dir=self.conf_folder, - input_dir=stage_cfg.get("input_dir"), - filter_config_file=f"{self.conf_folder}/{filter_cfg}", + input_data_dir=stage_cfg.get("input_dir"), + filter_config_file=f"{filter_cfg}", output_retained_document_dir=stage_cfg.get( "output_retained_document_dir"), **optional_args, From fd30fde3c52a03346c71a295c3a76572470093f3 Mon Sep 17 00:00:00 2001 From: Joseph Jennings Date: Fri, 28 Jul 2023 08:47:25 -0700 Subject: [PATCH 08/62] Change formatting to be consistent with current code. 
Add additional comments --- launcher_scripts/main.py | 17 ++--- .../core/data_curation_stages.py | 62 +++++++++++++------ 2 files changed, 46 insertions(+), 33 deletions(-) diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py index 61bec11eb6..744f6cefa5 100755 --- a/launcher_scripts/main.py +++ b/launcher_scripts/main.py @@ -31,15 +31,9 @@ Training, ) -omegaconf.OmegaConf.register_new_resolver("multiply", - lambda x, y: x * y, - replace=True) -omegaconf.OmegaConf.register_new_resolver("divide_ceil", - lambda x, y: int(math.ceil(x / y)), - replace=True) -omegaconf.OmegaConf.register_new_resolver("divide_floor", - lambda x, y: int(math.floor(x / y)), - replace=True) +omegaconf.OmegaConf.register_new_resolver("multiply", lambda x, y: x * y, replace=True) +omegaconf.OmegaConf.register_new_resolver("divide_ceil", lambda x, y: int(math.ceil(x / y)), replace=True) +omegaconf.OmegaConf.register_new_resolver("divide_floor", lambda x, y: int(math.floor(x / y)), replace=True) STR2STAGECLASS = { "training": Training, @@ -51,10 +45,7 @@ "export": Export, "evaluation": { EvalHarnessEvaluation: ["gpt3", "prompt_gpt3"], - NeMoEvaluation: [ - "t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", - "adapter_gpt3", "ia3_t5", "ia3_gpt3" - ], + NeMoEvaluation: ["t5", "mt5", "prompt_t5", "prompt_mt5", "adapter_t5", "adapter_gpt3", "ia3_t5", "ia3_gpt3"], }, "data_preparation": { PileDataPreparation: ["gpt3", "t5", "bert"], diff --git a/launcher_scripts/nemo_launcher/core/data_curation_stages.py b/launcher_scripts/nemo_launcher/core/data_curation_stages.py index f4872c562b..902132b992 100644 --- a/launcher_scripts/nemo_launcher/core/data_curation_stages.py +++ b/launcher_scripts/nemo_launcher/core/data_curation_stages.py @@ -13,12 +13,24 @@ class DataCurationStage(NemoMegatronStage): + """ + DataCurationStage is a base class for data curation stages. + It can hold multiple sub-stages. For example, preparing data from + Common Crawl requires download, extraction, deduplication and filtering. + They have dependencies on each other and will be launched one by one. + """ + def __init__(self, cfg): super().__init__(cfg) self.log_folder = Path() self.conf_folder = Path() def setup_folder_and_data(self): + """ + Each job in the data curation pipeline creates a directory + for writing logs (log_folder), writing and reading intermediate + results (results_folder) and for reading configs (conf_folder) + """ job_path = self.get_job_path() job_path.folder.mkdir(parents=True, exist_ok=True) # make the results dir @@ -31,21 +43,18 @@ def setup_folder_and_data(self): self.conf_folder = Path(job_path.folder, 'config') self.conf_folder.mkdir(parents=True, exist_ok=True) - def _make_cluster_parameters( - self, - cluster: str, - ) -> Dict: + def _make_cluster_parameters(self, cluster: str) -> Dict: + """ + Make a cluster-specific parameters for jobs on different clusters. + Current clusters include bcm(slurm), bcp and interactive. + For example for bcm, it will return slurm parameters: + {'job_name': 'some_name', 'nodes': 2, 'ntasks_per_node': 8, ...} + + :param str cluster: i.e. `bcm`, `bcp`, `interactive`, etc. + :param Optional sub_stage: current sub_stage name + :return: a dictionary of cluster parameters, e.g. `ntasks_per_node` + :rtype: Dict """ - Make a cluster-specific parameters for jobs on different clusters. - Current clusters include bcm(slurm), bcp and interactive. 
- For example for bcm, it will return slurm parameters: - {'job_name': 'some_name', 'nodes': 2, 'ntasks_per_node': 8, ...} - - :param str cluster: i.e. `bcm`, `bcp`, `interactive`, etc. - :param Optional sub_stage: current sub_stage name - :return: a dictionary of cluster parameters, e.g. `ntasks_per_node` - :rtype: Dict - """ cfg = self.cfg stage_cfg = self.stage_cfg @@ -83,42 +92,53 @@ def _make_cluster_parameters( return cluster_params - def run(self): + def run(self) -> str: + """ + Run current stage including all of the substages, returns job id on slurm based system otherwise empty string + + :return: job id on slurm based system otherwise empty string + :rtype: str + """ + # Create the job folders self.setup_folder_and_data() job_path = self.get_job_path() + # Make cluster configuration parameters cluster_parameters = self._make_cluster_parameters(self.cluster) stage_cfg_path = NemoMegatronStage.save_stage_hydra_config( self.stage_cfg, job_path, ) + # Build commands to launch on cluster command_groups = self.make_stage_command_groups(stage_cfg_path) + # Create the launcher for the cluster launcher = AutoLauncher( folder=self.get_job_path().folder, cluster=self.cluster, **cluster_parameters, ) + # Launch the job on the cluster job_id = launcher.launch(command_groups) return job_id class QualityFiltering(DataCurationStage): + """ DataCurationStage for performing quality filtering on documents """ + def __init__(self, cfg): super().__init__(cfg) def setup_stage_vars(self, cfg): + """Setup the stage vars, i.e. stage name and stage cfg""" self.stage_name = "quality_filtering" self.stage_cfg = cfg.get("quality_filtering") - def make_stage_command_groups( - self, - stage_cfg_path: Path, - ) -> List[List[str]]: - + def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]: + """ Builds the command groups for the current stage """ stage_cfg = self.stage_cfg # Write out the filter configuration as a separate config file @@ -127,6 +147,7 @@ def make_stage_command_groups( command_groups = [[]] + # If certain arguments are not specified, we remove them from the list optional_args = { "output_removed_document_dir": stage_cfg.get('output_removed_document_dir'), @@ -140,6 +161,7 @@ def make_stage_command_groups( for arg in optional_args if optional_args[arg] } + # Create the list of arguments for the filter_documents command args = create_args_list( replace_underscore=True, log_dir=self.log_folder, From a3a521a19b90b847b70d1955a921b6e809e848d0 Mon Sep 17 00:00:00 2001 From: Joseph Jennings Date: Fri, 28 Jul 2023 08:52:14 -0700 Subject: [PATCH 09/62] Add quality filtering to base config --- launcher_scripts/conf/config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 50ada47bdc..0b72956ac4 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -2,6 +2,7 @@ defaults: - _self_ - cluster: bcm # Leave it as bcm even if using bcp. It will be ignored for bcp. - data_preparation: gpt3/download_gpt3_pile + - quality_filtering: heuristic/english - training: gpt3/5b - conversion: gpt3/convert_gpt3 - fine_tuning: null @@ -57,6 +58,7 @@ numa_mapping: # Do not modify below, use the values above instead. 
data_preparation_config: ${hydra:runtime.choices.data_preparation} +quality_filtering_config: ${hydra:runtime.choices.quality_filtering} training_config: ${hydra:runtime.choices.training} fine_tuning_config: ${hydra:runtime.choices.fine_tuning} prompt_learning_config: ${hydra:runtime.choices.prompt_learning} From 911b22d0881cf52ccc687e1862ac370ea1ee2fb7 Mon Sep 17 00:00:00 2001 From: Joseph Jennings Date: Thu, 3 Aug 2023 06:35:32 -0700 Subject: [PATCH 10/62] Add documentation relating to task deduplication --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index ed1870da84..4478dfd0c7 100755 --- a/README.md +++ b/README.md @@ -5163,6 +5163,8 @@ Currently, within the NeMo Data Curator, we support the following data-curation - Fuzzy deduplication. Our implementation of fuzzy deduplication builds off of the following existing libraries: - For computing MinHash signatures we use a modified version of the MinHasher class provided in [pyLSH](https://github.com/mattilyra/LSH) - For the locality sensitive hashing, we extended the Redis-based implementation found in [datasketch](https://github.com/ekzhu/datasketch) beyond a single Redis server to a Redis Cluster. This enables this module to efficiently deduplicate large datasets that do not fit in memory of a single node (e.g., several TB of text) + - Multilingual downstream-task decontamination + - Our implementation follows the approach of [OpenAI GPT3](https://arxiv.org/pdf/2005.14165.pdf) and [Microsoft Turing NLG 530B](https://arxiv.org/abs/2201.11990) The modules are implemented in a scalable manner using [Message Passing Interface (MPI) for Python (mpi4py)](https://mpi4py.readthedocs.io/en/stable/) and we use [Dask](https://dask.org) for creating balanced input jsonl files. With the scalable modules within the NeMo Data Curator, we have been able to fully process a [Common Crawl Snapshot](https://commoncrawl.org/2020/12/nov-dec-2020-crawl-archive-now-available/) (consisting of 60 TB of compressed WARC files) in approximately two days using 30 CPU nodes (with hardware similar to the `c5.24xlarge` [Amazon AWS C5 instance](https://aws.amazon.com/ec2/instance-types/c5/)). Please note that the core functions used within the NeMo Data Curator (e.g., html extraction, text cleaning, heuristic filtering, etc.) have not been fully optimized. The main goal of the NeMo Data Curator is to provide users the capability to apply these functions to their large datasets using many compute nodes.
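Note on the downstream-task decontamination mentioned in the README hunk above: the general idea in the cited GPT-3-style approach is to collect N-grams from downstream evaluation/test data and filter training documents that overlap with them. The sketch below is illustrative only; it is not the NeMo Data Curator code or API, and the function names, the default N, and the drop-whole-document policy are assumptions made for this example.

    # Illustrative sketch only -- not the NeMo Data Curator implementation or its API.
    # Assumption: whole documents are dropped on any N-gram overlap with task data.
    from typing import Iterable, List, Set


    def ngrams(tokens: List[str], n: int) -> Iterable[tuple]:
        """Yield all overlapping n-grams of a token list."""
        for i in range(len(tokens) - n + 1):
            yield tuple(tokens[i:i + n])


    def build_task_ngrams(task_texts: Iterable[str], n: int = 13) -> Set[tuple]:
        """Collect the n-grams appearing in the downstream-task (test) data."""
        grams: Set[tuple] = set()
        for text in task_texts:
            grams.update(ngrams(text.lower().split(), n))
        return grams


    def decontaminate(docs: Iterable[str], task_ngrams: Set[tuple], n: int = 13) -> List[str]:
        """Keep only documents that share no n-gram with the task data."""
        kept = []
        for doc in docs:
            doc_grams = set(ngrams(doc.lower().split(), n))
            if doc_grams.isdisjoint(task_ngrams):
                kept.append(doc)
        return kept


    if __name__ == "__main__":
        test_set = ["what is the capital of france paris"]
        corpus = ["the capital of france is paris", "an unrelated training document"]
        # With n=3 the first document overlaps the test question and is removed.
        print(decontaminate(corpus, build_task_ngrams(test_set, n=3), n=3))

Real pipelines typically fix N (for example, 13-grams in the GPT-3 paper's contamination analysis) and may remove only the contaminated spans rather than entire documents; treat those details as configuration choices rather than facts about this repository.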
From 056a90edf9140a32bfd39652509244e7db055651 Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar Date: Mon, 7 Aug 2023 12:51:40 -0700 Subject: [PATCH 11/62] ptl bug fix Signed-off-by: Dmytro Pykhtar --- auto_configurator/base_configs/bert.yaml | 2 +- auto_configurator/base_configs/gpt3.yaml | 2 +- auto_configurator/base_configs/mt5.yaml | 2 +- auto_configurator/base_configs/t5.yaml | 2 +- .../tests/base_configs_tests/test_base_configs.py | 8 ++++---- launcher_scripts/conf/adapter_learning/gpt3/squad.yaml | 2 +- launcher_scripts/conf/adapter_learning/t5/squad.yaml | 2 +- launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml | 2 +- launcher_scripts/conf/evaluation/adapter_t5/squad.yaml | 2 +- launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml | 2 +- launcher_scripts/conf/evaluation/ia3_t5/squad.yaml | 2 +- launcher_scripts/conf/evaluation/mt5/custom_task.yaml | 2 +- launcher_scripts/conf/evaluation/mt5/xquad.yaml | 2 +- launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml | 2 +- launcher_scripts/conf/evaluation/prompt_t5/squad.yaml | 2 +- launcher_scripts/conf/evaluation/t5/custom_task.yaml | 2 +- launcher_scripts/conf/evaluation/t5/squad.yaml | 2 +- launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml | 2 +- launcher_scripts/conf/fine_tuning/gpt3/squad.yaml | 2 +- launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml | 2 +- launcher_scripts/conf/fine_tuning/mt5/xquad.yaml | 2 +- launcher_scripts/conf/fine_tuning/t5/custom_task.yaml | 2 +- launcher_scripts/conf/fine_tuning/t5/squad.yaml | 2 +- launcher_scripts/conf/ia3_learning/gpt3/squad.yaml | 2 +- launcher_scripts/conf/ia3_learning/t5/squad.yaml | 2 +- launcher_scripts/conf/prompt_learning/gpt3/squad.yaml | 2 +- launcher_scripts/conf/prompt_learning/mt5/squad.yaml | 2 +- launcher_scripts/conf/prompt_learning/t5/squad.yaml | 2 +- launcher_scripts/conf/training/bert/100b.yaml | 2 +- launcher_scripts/conf/training/bert/110m.yaml | 2 +- launcher_scripts/conf/training/bert/20b.yaml | 2 +- launcher_scripts/conf/training/bert/4b.yaml | 2 +- launcher_scripts/conf/training/gpt3/126m.yaml | 2 +- launcher_scripts/conf/training/gpt3/175b.yaml | 2 +- launcher_scripts/conf/training/gpt3/175b_performance.yaml | 2 +- launcher_scripts/conf/training/gpt3/1b_improved.yaml | 2 +- launcher_scripts/conf/training/gpt3/20b.yaml | 2 +- launcher_scripts/conf/training/gpt3/400m_improved.yaml | 2 +- launcher_scripts/conf/training/gpt3/40b.yaml | 2 +- launcher_scripts/conf/training/gpt3/40b_improved.yaml | 2 +- launcher_scripts/conf/training/gpt3/5b.yaml | 2 +- launcher_scripts/conf/training/gpt3/7b_improved.yaml | 2 +- launcher_scripts/conf/training/mt5/11b.yaml | 2 +- launcher_scripts/conf/training/mt5/170m.yaml | 2 +- launcher_scripts/conf/training/mt5/23b.yaml | 2 +- launcher_scripts/conf/training/mt5/390m.yaml | 2 +- launcher_scripts/conf/training/mt5/3b.yaml | 2 +- launcher_scripts/conf/training/t5/11b.yaml | 2 +- launcher_scripts/conf/training/t5/220m.yaml | 2 +- launcher_scripts/conf/training/t5/23b.yaml | 2 +- launcher_scripts/conf/training/t5/3b.yaml | 2 +- launcher_scripts/conf/training/t5/41b.yaml | 2 +- 52 files changed, 55 insertions(+), 55 deletions(-) diff --git a/auto_configurator/base_configs/bert.yaml b/auto_configurator/base_configs/bert.yaml index 305040666e..01e3be140e 100644 --- a/auto_configurator/base_configs/bert.yaml +++ b/auto_configurator/base_configs/bert.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: 
False max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches max_time: "00:23:30:00" diff --git a/auto_configurator/base_configs/gpt3.yaml b/auto_configurator/base_configs/gpt3.yaml index 4eeaf79ce2..a69ba139eb 100644 --- a/auto_configurator/base_configs/gpt3.yaml +++ b/auto_configurator/base_configs/gpt3.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 600000 # consumed_samples = global_step * global_batch_size max_time: "00:23:30:00" # days:hours:minutes:seconds diff --git a/auto_configurator/base_configs/mt5.yaml b/auto_configurator/base_configs/mt5.yaml index 96053b9ac4..a0f3d70d8a 100644 --- a/auto_configurator/base_configs/mt5.yaml +++ b/auto_configurator/base_configs/mt5.yaml @@ -16,7 +16,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" diff --git a/auto_configurator/base_configs/t5.yaml b/auto_configurator/base_configs/t5.yaml index cd1ef0ac87..06c6016f78 100644 --- a/auto_configurator/base_configs/t5.yaml +++ b/auto_configurator/base_configs/t5.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" diff --git a/auto_configurator/tests/base_configs_tests/test_base_configs.py b/auto_configurator/tests/base_configs_tests/test_base_configs.py index 0919ee65ab..4fb155628d 100644 --- a/auto_configurator/tests/base_configs_tests/test_base_configs.py +++ b/auto_configurator/tests/base_configs_tests/test_base_configs.py @@ -18,7 +18,7 @@ def test_gpt3_base_config(self): precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 600000 max_time: "00:23:30:00" @@ -196,7 +196,7 @@ def test_t5_base_config(self): precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" @@ -421,7 +421,7 @@ def test_mt5_base_config(self): precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" @@ -642,7 +642,7 @@ def test_bert_base_config(self): precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. 
max_steps: 100000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches max_time: "00:23:30:00" diff --git a/launcher_scripts/conf/adapter_learning/gpt3/squad.yaml b/launcher_scripts/conf/adapter_learning/gpt3/squad.yaml index 5abbbc9cb3..fe2ceea017 100755 --- a/launcher_scripts/conf/adapter_learning/gpt3/squad.yaml +++ b/launcher_scripts/conf/adapter_learning/gpt3/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 4 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/adapter_learning/t5/squad.yaml b/launcher_scripts/conf/adapter_learning/t5/squad.yaml index f82940d489..a5fc08f7a0 100755 --- a/launcher_scripts/conf/adapter_learning/t5/squad.yaml +++ b/launcher_scripts/conf/adapter_learning/t5/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 4 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml b/launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml index 75c9774e14..a7dbd31065 100755 --- a/launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml +++ b/launcher_scripts/conf/evaluation/adapter_gpt3/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 inference: diff --git a/launcher_scripts/conf/evaluation/adapter_t5/squad.yaml b/launcher_scripts/conf/evaluation/adapter_t5/squad.yaml index d18cc08856..91d2cec798 100755 --- a/launcher_scripts/conf/evaluation/adapter_t5/squad.yaml +++ b/launcher_scripts/conf/evaluation/adapter_t5/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 inference: diff --git a/launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml b/launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml index d109a98557..046d7c9ae0 100755 --- a/launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml +++ b/launcher_scripts/conf/evaluation/ia3_gpt3/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 inference: diff --git a/launcher_scripts/conf/evaluation/ia3_t5/squad.yaml b/launcher_scripts/conf/evaluation/ia3_t5/squad.yaml index 48480c074a..40b9594f68 100755 --- a/launcher_scripts/conf/evaluation/ia3_t5/squad.yaml +++ b/launcher_scripts/conf/evaluation/ia3_t5/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 inference: diff --git a/launcher_scripts/conf/evaluation/mt5/custom_task.yaml b/launcher_scripts/conf/evaluation/mt5/custom_task.yaml index ce3523d3e7..128937204b 100755 --- a/launcher_scripts/conf/evaluation/mt5/custom_task.yaml +++ b/launcher_scripts/conf/evaluation/mt5/custom_task.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 diff --git 
a/launcher_scripts/conf/evaluation/mt5/xquad.yaml b/launcher_scripts/conf/evaluation/mt5/xquad.yaml index 6d733fec7f..89771d546b 100755 --- a/launcher_scripts/conf/evaluation/mt5/xquad.yaml +++ b/launcher_scripts/conf/evaluation/mt5/xquad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 exp_manager: diff --git a/launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml b/launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml index 01278be854..a223289ffc 100755 --- a/launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml +++ b/launcher_scripts/conf/evaluation/prompt_mt5/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 data: diff --git a/launcher_scripts/conf/evaluation/prompt_t5/squad.yaml b/launcher_scripts/conf/evaluation/prompt_t5/squad.yaml index 7b549fedf7..c1fb88caed 100755 --- a/launcher_scripts/conf/evaluation/prompt_t5/squad.yaml +++ b/launcher_scripts/conf/evaluation/prompt_t5/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 data: diff --git a/launcher_scripts/conf/evaluation/t5/custom_task.yaml b/launcher_scripts/conf/evaluation/t5/custom_task.yaml index 90e0ebb38d..2959469ccd 100755 --- a/launcher_scripts/conf/evaluation/t5/custom_task.yaml +++ b/launcher_scripts/conf/evaluation/t5/custom_task.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 diff --git a/launcher_scripts/conf/evaluation/t5/squad.yaml b/launcher_scripts/conf/evaluation/t5/squad.yaml index f50843d82f..39c954a943 100755 --- a/launcher_scripts/conf/evaluation/t5/squad.yaml +++ b/launcher_scripts/conf/evaluation/t5/squad.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False log_every_n_steps: 10 diff --git a/launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml b/launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml index 9d9ebabd1b..e55341fb9b 100644 --- a/launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml +++ b/launcher_scripts/conf/fine_tuning/gpt3/custom_task.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 1 max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 # frequency with which training steps are logged diff --git a/launcher_scripts/conf/fine_tuning/gpt3/squad.yaml b/launcher_scripts/conf/fine_tuning/gpt3/squad.yaml index 4730f2f1ae..17dfb0fdc7 100644 --- a/launcher_scripts/conf/fine_tuning/gpt3/squad.yaml +++ b/launcher_scripts/conf/fine_tuning/gpt3/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 13000 # consumed_samples = global_step * 
micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 # frequency with which training steps are logged diff --git a/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml b/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml index bea1aacee8..abd3c2565c 100755 --- a/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml +++ b/launcher_scripts/conf/fine_tuning/mt5/custom_task.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 5 max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 diff --git a/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml b/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml index f8d677fba5..8190e47aa5 100755 --- a/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml +++ b/launcher_scripts/conf/fine_tuning/mt5/xquad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 5 max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 diff --git a/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml b/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml index a3b2960f9c..54c3166405 100755 --- a/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml +++ b/launcher_scripts/conf/fine_tuning/t5/custom_task.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 5 max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 diff --git a/launcher_scripts/conf/fine_tuning/t5/squad.yaml b/launcher_scripts/conf/fine_tuning/t5/squad.yaml index da5cc2c252..d608fd28ec 100755 --- a/launcher_scripts/conf/fine_tuning/t5/squad.yaml +++ b/launcher_scripts/conf/fine_tuning/t5/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 5 max_steps: -1 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches log_every_n_steps: 10 diff --git a/launcher_scripts/conf/ia3_learning/gpt3/squad.yaml b/launcher_scripts/conf/ia3_learning/gpt3/squad.yaml index 2e7ed23f3e..b5d643c94c 100755 --- a/launcher_scripts/conf/ia3_learning/gpt3/squad.yaml +++ b/launcher_scripts/conf/ia3_learning/gpt3/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 4 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/ia3_learning/t5/squad.yaml b/launcher_scripts/conf/ia3_learning/t5/squad.yaml index 840fce46b2..3e0900b058 100755 --- a/launcher_scripts/conf/ia3_learning/t5/squad.yaml +++ b/launcher_scripts/conf/ia3_learning/t5/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 4 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/prompt_learning/gpt3/squad.yaml 
b/launcher_scripts/conf/prompt_learning/gpt3/squad.yaml index ea42f3c4ba..32fda8389c 100755 --- a/launcher_scripts/conf/prompt_learning/gpt3/squad.yaml +++ b/launcher_scripts/conf/prompt_learning/gpt3/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 4 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/prompt_learning/mt5/squad.yaml b/launcher_scripts/conf/prompt_learning/mt5/squad.yaml index 19bf9c7447..99c9871ca8 100755 --- a/launcher_scripts/conf/prompt_learning/mt5/squad.yaml +++ b/launcher_scripts/conf/prompt_learning/mt5/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 10 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/prompt_learning/t5/squad.yaml b/launcher_scripts/conf/prompt_learning/t5/squad.yaml index 755323e938..27d54627c6 100755 --- a/launcher_scripts/conf/prompt_learning/t5/squad.yaml +++ b/launcher_scripts/conf/prompt_learning/t5/squad.yaml @@ -15,7 +15,7 @@ trainer: precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 10 max_steps: -1 log_every_n_steps: 10 diff --git a/launcher_scripts/conf/training/bert/100b.yaml b/launcher_scripts/conf/training/bert/100b.yaml index d63a844756..84d7170dae 100755 --- a/launcher_scripts/conf/training/bert/100b.yaml +++ b/launcher_scripts/conf/training/bert/100b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. max_steps: 860000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches max_time: "81:23:30:00" diff --git a/launcher_scripts/conf/training/bert/110m.yaml b/launcher_scripts/conf/training/bert/110m.yaml index 47b2e95839..8d72872eb2 100755 --- a/launcher_scripts/conf/training/bert/110m.yaml +++ b/launcher_scripts/conf/training/bert/110m.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. max_steps: 13800000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches max_time: "7:23:30:00" diff --git a/launcher_scripts/conf/training/bert/20b.yaml b/launcher_scripts/conf/training/bert/20b.yaml index 79312130cf..729b8e0ef7 100755 --- a/launcher_scripts/conf/training/bert/20b.yaml +++ b/launcher_scripts/conf/training/bert/20b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. 
max_steps: 860000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches max_time: "90:23:30:00" diff --git a/launcher_scripts/conf/training/bert/4b.yaml b/launcher_scripts/conf/training/bert/4b.yaml index 5e435c48a2..e925f5621a 100755 --- a/launcher_scripts/conf/training/bert/4b.yaml +++ b/launcher_scripts/conf/training/bert/4b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: -1 # PTL default. In practice we don't usually train for more than 1 epoch. max_steps: 1720000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches max_time: "26:23:30:00" diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index 3e921b5bb0..affee0765e 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 600000 # consumed_samples = global_step * global_batch_size max_time: "00:23:30:00" # days:hours:minutes:seconds diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index c37e35e01e..493d24d516 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 75000 # consumed_samples = global_step * global_batch_size max_time: "25:23:00:00" diff --git a/launcher_scripts/conf/training/gpt3/175b_performance.yaml b/launcher_scripts/conf/training/gpt3/175b_performance.yaml index a83d4f956a..976deda501 100755 --- a/launcher_scripts/conf/training/gpt3/175b_performance.yaml +++ b/launcher_scripts/conf/training/gpt3/175b_performance.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 75000 # consumed_samples = global_step * global_batch_size max_time: "25:23:00:00" diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml index 25cbabb00e..1ff6b3dbf0 100644 --- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 300000 # consumed_samples = global_step * global_batch_size max_time: "02:23:30:00" # days:hours:minutes:seconds diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml index b8b08c3a32..e48788e197 100755 --- a/launcher_scripts/conf/training/gpt3/20b.yaml +++ b/launcher_scripts/conf/training/gpt3/20b.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 75000 # consumed_samples 
= global_step * global_batch_size max_time: "06:23:30:00" diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml index 039b9bdf01..5b1e6b915f 100644 --- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 600000 # consumed_samples = global_step * global_batch_size max_time: "01:23:30:00" # days:hours:minutes:seconds diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml index e2748f9b8b..84c1802bc9 100755 --- a/launcher_scripts/conf/training/gpt3/40b.yaml +++ b/launcher_scripts/conf/training/gpt3/40b.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 75000 # consumed_samples = global_step * global_batch_size max_time: "6:11:00:00" diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml index dad3f3e639..8686a171be 100644 --- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 100000 # consumed_samples = global_step * global_batch_size max_time: "6:11:00:00" # days:hours:minutes:seconds diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml index 564fb4503a..ae99d3e063 100755 --- a/launcher_scripts/conf/training/gpt3/5b.yaml +++ b/launcher_scripts/conf/training/gpt3/5b.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 75000 # consumed_samples = global_step * global_batch_size max_time: "05:23:30:00" diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml index 40f8d9ab88..0eec1b43ba 100644 --- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml @@ -11,7 +11,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 300000 # consumed_samples = global_step * global_batch_size max_time: "05:23:30:00" # days:hours:minutes:seconds diff --git a/launcher_scripts/conf/training/mt5/11b.yaml b/launcher_scripts/conf/training/mt5/11b.yaml index f6d6a67fc1..3111159db4 100755 --- a/launcher_scripts/conf/training/mt5/11b.yaml +++ b/launcher_scripts/conf/training/mt5/11b.yaml @@ -16,7 +16,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "44:23:30:00" diff --git a/launcher_scripts/conf/training/mt5/170m.yaml 
b/launcher_scripts/conf/training/mt5/170m.yaml index 49a04fc2a2..b166c26496 100755 --- a/launcher_scripts/conf/training/mt5/170m.yaml +++ b/launcher_scripts/conf/training/mt5/170m.yaml @@ -16,7 +16,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" diff --git a/launcher_scripts/conf/training/mt5/23b.yaml b/launcher_scripts/conf/training/mt5/23b.yaml index d38ea399cf..dab9d9504e 100755 --- a/launcher_scripts/conf/training/mt5/23b.yaml +++ b/launcher_scripts/conf/training/mt5/23b.yaml @@ -16,7 +16,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "54:23:30:00" diff --git a/launcher_scripts/conf/training/mt5/390m.yaml b/launcher_scripts/conf/training/mt5/390m.yaml index 479b533b3f..c03436bb8b 100755 --- a/launcher_scripts/conf/training/mt5/390m.yaml +++ b/launcher_scripts/conf/training/mt5/390m.yaml @@ -16,7 +16,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" diff --git a/launcher_scripts/conf/training/mt5/3b.yaml b/launcher_scripts/conf/training/mt5/3b.yaml index 3a0df27e4c..96b2c367bb 100755 --- a/launcher_scripts/conf/training/mt5/3b.yaml +++ b/launcher_scripts/conf/training/mt5/3b.yaml @@ -16,7 +16,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "17:23:30:00" diff --git a/launcher_scripts/conf/training/t5/11b.yaml b/launcher_scripts/conf/training/t5/11b.yaml index 9ee9b3288d..0f47b6e5e7 100755 --- a/launcher_scripts/conf/training/t5/11b.yaml +++ b/launcher_scripts/conf/training/t5/11b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "44:23:30:00" diff --git a/launcher_scripts/conf/training/t5/220m.yaml b/launcher_scripts/conf/training/t5/220m.yaml index 2b1549dc8c..73f56344a5 100755 --- a/launcher_scripts/conf/training/t5/220m.yaml +++ b/launcher_scripts/conf/training/t5/220m.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1000000 # consumed_samples = global_step * global_batch_size max_time: "06:23:30:00" diff --git a/launcher_scripts/conf/training/t5/23b.yaml b/launcher_scripts/conf/training/t5/23b.yaml index 30ae8d6037..1050285cc7 100755 --- a/launcher_scripts/conf/training/t5/23b.yaml +++ b/launcher_scripts/conf/training/t5/23b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False 
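Every hunk in this patch makes the same one-line substitution inside a config's `trainer` block; a representative result (field values differ per model) is sketched below. The key rename tracks PyTorch Lightning 2.x, which replaced the `replace_sampler_ddp` Trainer argument with `use_distributed_sampler` — offered here as background, since the commit message itself only says "ptl bug fix".

```yaml
trainer:
  precision: bf16
  logger: False                    # logger provided by exp_manager
  enable_checkpointing: False
  use_distributed_sampler: False   # previously: replace_sampler_ddp: False
  max_epochs: null
  max_steps: 75000                 # consumed_samples = global_step * global_batch_size
```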
max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "54:23:30:00" diff --git a/launcher_scripts/conf/training/t5/3b.yaml b/launcher_scripts/conf/training/t5/3b.yaml index a2f4c99e59..02c51654fc 100755 --- a/launcher_scripts/conf/training/t5/3b.yaml +++ b/launcher_scripts/conf/training/t5/3b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "14:23:30:00" diff --git a/launcher_scripts/conf/training/t5/41b.yaml b/launcher_scripts/conf/training/t5/41b.yaml index 6d23f6e670..599e389f16 100755 --- a/launcher_scripts/conf/training/t5/41b.yaml +++ b/launcher_scripts/conf/training/t5/41b.yaml @@ -14,7 +14,7 @@ trainer: precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 1066667 # consumed_samples = global_step * global_batch_size max_time: "99:23:30:00" From a3393e3f078c1f7693abe98539827c873af494f9 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 7 Aug 2023 16:39:22 -0700 Subject: [PATCH 12/62] Update main.py to add PEFT stage Add PEFT stage --- launcher_scripts/main.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/launcher_scripts/main.py b/launcher_scripts/main.py index 4053328f2c..e1685e2a27 100755 --- a/launcher_scripts/main.py +++ b/launcher_scripts/main.py @@ -24,6 +24,7 @@ Conversion, EvalHarnessEvaluation, FineTuning, + PEFT, IA3Learning, NeMoEvaluation, PromptLearning, @@ -37,6 +38,7 @@ STR2STAGECLASS = { "training": Training, "fine_tuning": FineTuning, + "peft": PEFT, "prompt_learning": PromptLearning, "adapter_learning": AdapterLearning, "ia3_learning": IA3Learning, From c29586224ec8c4c65404511bcea3c72a0d8c01ac Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 7 Aug 2023 16:41:16 -0700 Subject: [PATCH 13/62] Update stages.py Add PEFT stage, which uses unified NeMo PEFT tuning script examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py --- launcher_scripts/nemo_launcher/core/stages.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 6196a94425..ae093d0844 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -691,6 +691,45 @@ def _get_nemo_code_path(self, model_type: str) -> Path: } return model_type_to_code_path[model_type] +class PEFT(NeMoStage): + """Stage class of PEFT with NeMo scripts""" + + def setup_stage_vars(self, cfg): + """Setup the stage vars, i.e. 
stage name and stage cfg""" + self.stage_name = "peft" + self.stage_cfg = cfg.get("peft") + + def setup_folder_and_data(self) -> None: + """Setup job/data folders and fine-tuning/prompt-learning dataset""" + # Setup folders + super().setup_folder_and_data() + + # Prepare prompt learning dataset + data_dir = self.cfg.get("data_dir") + task_name = self.stage_cfg.run.get("task_name") + + # Prepare dataset for squad + if task_name in ["squad", "xquad"]: + prepare_squad_for_fine_tuning(data_dir=os.path.join(data_dir, "squad_data")) + + + def _get_nemo_code_path(self, model_type: str) -> Path: + """ + Provide the essential nemo code path for running the stage, usually different model types use different nemo scripts. + For example, `megatron_t5_pretraining.py` for t5 and `megatron_gpt_pretraining.py` for gpt3. + + :param str model_type: i.e. `gpt3`, `t5`, `mt5`, etc. + :return: path current stage's essential nemo scripts code + :rtype: Path + """ + if model_type == "t5": + raise NotImplementedError("PEFT is not supported in NeMo Megatron t5 models.") + if model_type == "mt5": + raise NotImplementedError("PEFT is not supported in NeMo Megatron mt5 models.") + model_type_to_code_path = { + "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py", + } + return model_type_to_code_path[model_type] class PromptLearning(NeMoStage): """Stage class of prompt-learning with NeMo scripts""" From 75de3e66cf8b45a2741920a8761dfbb85c717836 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 7 Aug 2023 17:14:43 -0700 Subject: [PATCH 14/62] Update config.yaml Add PEFT stage --- launcher_scripts/conf/config.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index e5e1da9e0a..d27c89c934 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -5,6 +5,7 @@ defaults: - training: gpt3/5b - conversion: gpt3/convert_gpt3 - fine_tuning: null + - peft: null - prompt_learning: null - adapter_learning: null - ia3_learning: null @@ -59,6 +60,7 @@ numa_mapping: data_preparation_config: ${hydra:runtime.choices.data_preparation} training_config: ${hydra:runtime.choices.training} fine_tuning_config: ${hydra:runtime.choices.fine_tuning} +peft_config: ${hydra:runtime.choices.peft} prompt_learning_config: ${hydra:runtime.choices.prompt_learning} adapter_learning_config: ${hydra:runtime.choices.adapter_learning} ia3_learning_config: ${hydra:runtime.choices.ia3_learning} From 1ba30f759813e4d447bb4415d8aa351e153d320c Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 7 Aug 2023 17:19:56 -0700 Subject: [PATCH 15/62] Create squad.yaml Have peft_scheme in the file. Available options: adapter, ia3, ptuning, adapter_and_ptuning and lora. 
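Before the command example that follows, a sketch of how the new `peft` config group is expected to resolve — assuming Hydra's standard config-group behavior together with the `- peft: null` defaults entry added in the previous commit:

```yaml
# conf/config.yaml (after the previous commit)
defaults:
  - peft: null   # no PEFT config is loaded unless one is selected on the command line

# Overriding `peft=gpt3/squad stages=[peft]` loads conf/peft/gpt3/squad.yaml
# (the file created by this commit) and runs only the PEFT stage.
```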
PEFT command example for BCP: python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/main.py peft=gpt3/squad stages=[peft] cluster_type=bcp launcher_scripts_path=/opt/NeMo-Megatron-Launcher/launcher_scripts peft.model.peft.peft_scheme=ptuning --- launcher_scripts/conf/peft/gpt3/squad.yaml | 225 +++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 launcher_scripts/conf/peft/gpt3/squad.yaml diff --git a/launcher_scripts/conf/peft/gpt3/squad.yaml b/launcher_scripts/conf/peft/gpt3/squad.yaml new file mode 100644 index 0000000000..e3da77bba7 --- /dev/null +++ b/launcher_scripts/conf/peft/gpt3/squad.yaml @@ -0,0 +1,225 @@ +name: megatron_gpt_peft_tuning-${peft.model.peft.peft_scheme} + +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: gpt3_5b + convert_dir: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name} + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name} + +trainer: + devices: 8 + num_nodes: 1 + accelerator: gpu + precision: bf16 + logger: False + enable_checkpointing: False + replace_sampler_ddp: False + max_epochs: 4 + max_steps: -1 + log_every_n_steps: 10 + val_check_interval: 200 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${peft.name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${peft.model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${peft.name}--{${peft.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${peft.model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 128 + micro_batch_size: 4 + restore_from_path: ${peft.run.convert_dir}/results/megatron_gpt.nemo # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: False + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
+ sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + answer_only_loss: True + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + + peft: + peft_scheme: "ptuning" # can be either adapter,ia3, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre' or 'post', 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + + lora_tuning: + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + data: + chat: False # whether use chatbot data or not + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: + - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: + - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: False + separate_prompt_and_response_with_newline: True + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. 
Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: + - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: + - ${peft.run.task_name} # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: ${peft.model.data.train_ds.max_seq_length} + min_seq_length: ${peft.model.data.train_ds.min_seq_length} + drop_last: True + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + test_ds: + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: ${peft.model.data.train_ds.max_seq_length} + min_seq_length: ${peft.model.data.train_ds.min_seq_length} + drop_last: True + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. 
+ num_classes: null + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + monitor: val_loss + reduce_on_plateau: false From 4f1c768889e82e746d6cd966f5545144439a7326 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 09:59:45 -0700 Subject: [PATCH 16/62] Update README.md Update with PEFT Framework section --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 3df4288d54..8affd0ba3f 100755 --- a/README.md +++ b/README.md @@ -207,6 +207,7 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co - [5.16.2.6 PPO Hyper-parameters](#51626-ppo-hyper-parameters) + [5.16.3. Future Work](#5163-future-work) * [5.17 Curating pretraining datasets with the NeMo Data Curator](#517-curating-pretraining-datasets-with-the-nemo-data-curator) + * [5.18 Parameter-Efficient Fine-Tuning (PEFT) Framework with unified PEFT methods](#518-parameter-efficient-fine-tuning-(peft)-framework-with-unified-peft-methods) - [6. Deploying the NeMo Megatron Model](#6-deploying-the-nemo-megatron-model) * [6.1. Run NVIDIA Triton Server with Generated Model Repository](#61-run-nvidia-triton-server-with-generated-model-repository) - [6.2. GPT Text Generation with Ensemble](#62-gpt-text-generation-with-ensemble) From a30a02228c7e73b65da69b16dbb7b37731267457 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 10:35:28 -0700 Subject: [PATCH 17/62] Update README.md Update PEFT Framework Training with Launcher --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8affd0ba3f..e9c2be6e40 100755 --- a/README.md +++ b/README.md @@ -145,8 +145,10 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co * [5.12 LoRA Model and Generalized PEFT Framework](#512-lora-model-and-generalized-peft-framework) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) - [5.12.1.1 PEFT Training and Inference](#51211-peft-training-and-inference) + - [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcer) - [5.12.2 PEFT Training and Inference for mT5/T5-style Models](#5122-peft-training-and-inference-for-mt5-t5-style-models) - [5.12.2.1 PEFT Training and Inference](#51221-peft-training-and-inference) + + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) * [5.13. Model Evaluation](#513-model-evaluation) + [5.13.1. GPT Evaluation](#5131-gpt-evaluation) - [5.13.1.1. Common](#51311-common) @@ -207,7 +209,6 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co - [5.16.2.6 PPO Hyper-parameters](#51626-ppo-hyper-parameters) + [5.16.3. Future Work](#5163-future-work) * [5.17 Curating pretraining datasets with the NeMo Data Curator](#517-curating-pretraining-datasets-with-the-nemo-data-curator) - * [5.18 Parameter-Efficient Fine-Tuning (PEFT) Framework with unified PEFT methods](#518-parameter-efficient-fine-tuning-(peft)-framework-with-unified-peft-methods) - [6. Deploying the NeMo Megatron Model](#6-deploying-the-nemo-megatron-model) * [6.1. 
Run NVIDIA Triton Server with Generated Model Repository](#61-run-nvidia-triton-server-with-generated-model-repository) - [6.2. GPT Text Generation with Ensemble](#62-gpt-text-generation-with-ensemble) @@ -3777,6 +3778,13 @@ inference.outfile_path= ``` Additionally, NeMo has a notebook which walks through the steps (which these scripts encapsulate) to train and run inference for PEFT models: https://github.com/NVIDIA/NeMo/blob/main/tutorials/nlp/lora.ipynb +##### 5.12.1.2 PEFT Training with NeMo Megatron Launcher +PEFT stage could launch PEFT methods including PTuning, LoRA, Adapters and IA3 in a single stage, by setting different peft scheme. +It is implemented via adapter_mixins framework with a unify style. +mix-n-match PEFT scheme like adapter_and_ptuning can be easily extended to do ia3_and_ptuning or lora_and_ptuning + +PTuning does not need to flexibility to insert prompt tokens anywhere in the input. This feature has been removed for simplicity. + ##### 5.12.2 PEFT Training and Inference for mT5/T5-style Models We offer training and inference scripts in NeMo for parameter efficient tuning of mT5/T5-style models. You can train a LoRA, P-tuning, Adapter, or IA3 model using its corresponding training and inference script. From a43a0c3c28a6122b2e37eec5b160adbcc766342a Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 10:44:01 -0700 Subject: [PATCH 18/62] edit readme for PEFT framework methods --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e9c2be6e40..db6cc3e5c5 100755 --- a/README.md +++ b/README.md @@ -145,7 +145,7 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co * [5.12 LoRA Model and Generalized PEFT Framework](#512-lora-model-and-generalized-peft-framework) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) - [5.12.1.1 PEFT Training and Inference](#51211-peft-training-and-inference) - - [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcer) + - [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcher) - [5.12.2 PEFT Training and Inference for mT5/T5-style Models](#5122-peft-training-and-inference-for-mt5-t5-style-models) - [5.12.2.1 PEFT Training and Inference](#51221-peft-training-and-inference) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) From aa517f3d6191d57eb680f305b48b4d9b03227074 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 11:02:51 -0700 Subject: [PATCH 19/62] Update README.md Update PEFT Framework methods with example script --- README.md | 69 ++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 68 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index db6cc3e5c5..b9d9c49893 100755 --- a/README.md +++ b/README.md @@ -145,7 +145,8 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co * [5.12 LoRA Model and Generalized PEFT Framework](#512-lora-model-and-generalized-peft-framework) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) - [5.12.1.1 PEFT Training and Inference](#51211-peft-training-and-inference) - - [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcher) + + [5.12.1.2 PEFT Training with NeMo Megatron 
Launcher](#51212-peft-training-with-nemo-megatron-launcher) + - [5.12.1.2.1 Base Command Platform](#512121-base-command-platform) - [5.12.2 PEFT Training and Inference for mT5/T5-style Models](#5122-peft-training-and-inference-for-mt5-t5-style-models) - [5.12.2.1 PEFT Training and Inference](#51221-peft-training-and-inference) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) @@ -3785,6 +3786,72 @@ mix-n-match PEFT scheme like adapter_and_ptuning can be easily extended to do ia PTuning does not need to flexibility to insert prompt tokens anywhere in the input. This feature has been removed for simplicity. +##### 5.12.1.2.1 Base Command Platform + +In order to run the ptuning learning script on Base Command Platform, set the +`cluster_type` parameter in `conf/config.yaml` to `bcp` or `interactive`. This can also be overridden +from the command line, using hydra. + +```bash +export HYDRA_FULL_ERROR=1 +export TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO + +TRAIN="[/mount/workspace/databricks-dolly-15k-train.jsonl]" +VALID="[/mount/workspace/databricks-dolly-15k-val.jsonl]" +VALID_NAMES="[peft-squad]" +CONCAT_SAMPLING_PROBS="[1]" + +PEFT_SCHEME="ptuning" +PEFT_EXP_DIR="/results/nemo_launcher/ptuning" +LOG_DIR="/results/nemo_launcher/ptuning_log" + +TP_SIZE=2 + +PP_SIZE=1 + +python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/main.py \ + peft=gpt3/squad \ + stages=[peft] \ + cluster_type=interactive \ + launcher_scripts_path=/opt/NeMo-Megatron-Launcher/launcher_scripts \ + peft.model.peft.peft_scheme=${PEFT_SCHEME} \ + peft.trainer.precision=bf16 \ + peft.trainer.max_steps=100 \ + peft.trainer.devices=2 \ + peft.trainer.val_check_interval=10 \ + peft.model.megatron_amp_O2=False \ + peft.model.restore_from_path=/mount/workspace/nemo_gpt1.3B_fp16.nemo \ + peft.model.tensor_model_parallel_size=${TP_SIZE} \ + peft.model.pipeline_model_parallel_size=${PP_SIZE} \ + peft.model.optim.lr=5e-6 \ + peft.model.answer_only_loss=True \ + peft.model.data.train_ds.file_names=${TRAIN} \ + peft.model.data.train_ds.micro_batch_size=1 \ + peft.model.data.train_ds.global_batch_size=32 \ + peft.model.data.train_ds.concat_sampling_probabilities=${CONCAT_SAMPLING_PROBS} \ + peft.model.data.validation_ds.micro_batch_size=1 \ + peft.model.data.validation_ds.global_batch_size=32 \ + peft.model.data.validation_ds.file_names=${VALID} \ + peft.model.data.validation_ds.names=${VALID_NAMES} \ + peft.model.data.test_ds.micro_batch_size=1 \ + peft.model.data.test_ds.global_batch_size=128 \ + peft.model.data.train_ds.num_workers=0 \ + peft.model.data.validation_ds.num_workers=0 \ + peft.model.data.test_ds.num_workers=0 \ + peft.model.data.validation_ds.metric.name=loss \ + peft.model.data.test_ds.metric.name=loss \ + peft.exp_manager.exp_dir=${PEFT_EXP_DIR} \ + peft.exp_manager.explicit_log_dir=${LOG_DIR} \ + peft.exp_manager.resume_if_exists=True \ + peft.exp_manager.resume_ignore_no_checkpoint=True \ + peft.exp_manager.create_checkpoint_callback=True \ + peft.exp_manager.checkpoint_callback_params.monitor=validation_loss +``` + +The command above assumes you mounted the data workspace in `/mount/workspace/` (e.g. the example script uses databricks-dolly-15k dataset), and the results workspace in `/mount/results`. The command needs set different peft.exp_manager.exp_dir for different PEFT jobs. +The stdout and stderr outputs will also be redirected to the `/results/nemo_launcher/ptuning_log`, to be able to download the logs from NGC. 
+Any other parameter can also be added to the command to modify its behavior. + ##### 5.12.2 PEFT Training and Inference for mT5/T5-style Models We offer training and inference scripts in NeMo for parameter efficient tuning of mT5/T5-style models. You can train a LoRA, P-tuning, Adapter, or IA3 model using its corresponding training and inference script. From e437c6c2edee1439ea8d0e165d095eafbe92c69f Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 11:07:24 -0700 Subject: [PATCH 20/62] Update README.md PEFT --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b9d9c49893..3ac121e110 100755 --- a/README.md +++ b/README.md @@ -3792,6 +3792,7 @@ In order to run the ptuning learning script on Base Command Platform, set the `cluster_type` parameter in `conf/config.yaml` to `bcp` or `interactive`. This can also be overridden from the command line, using hydra. +To run the ptuning pipeline to nemo-megatron-gpt-1.3B model converted checkpoint, run: ```bash export HYDRA_FULL_ERROR=1 export TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO From 7034b27fedf3bdc07f9ef4e2e24af06087dfbfad Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 11:18:03 -0700 Subject: [PATCH 21/62] Update README.md Update PEFT --- README.md | 44 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3ac121e110..a5ada22cf6 100755 --- a/README.md +++ b/README.md @@ -146,7 +146,8 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) - [5.12.1.1 PEFT Training and Inference](#51211-peft-training-and-inference) + [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcher) - - [5.12.1.2.1 Base Command Platform](#512121-base-command-platform) + - [5.12.1.2.1 Slurm](#512121-slurm) + - [5.12.1.2.2 Base Command Platform](#512122-base-command-platform) - [5.12.2 PEFT Training and Inference for mT5/T5-style Models](#5122-peft-training-and-inference-for-mt5-t5-style-models) - [5.12.2.1 PEFT Training and Inference](#51221-peft-training-and-inference) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) @@ -3786,7 +3787,44 @@ mix-n-match PEFT scheme like adapter_and_ptuning can be easily extended to do ia PTuning does not need to flexibility to insert prompt tokens anywhere in the input. This feature has been removed for simplicity. 
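The mix-and-match schemes mentioned above are selected the same way as the single schemes; a minimal invocation, assuming the `gpt3/squad` PEFT config added earlier and `adapter_and_ptuning` (listed as an available option in that config's commit message), might look like:

```bash
python3 main.py \
    peft=gpt3/squad \
    stages=[peft] \
    peft.model.peft.peft_scheme=adapter_and_ptuning \
    peft.model.restore_from_path=${LANGUAGE_MODEL_PATH}
```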
-##### 5.12.1.2.1 Base Command Platform +##### 5.12.1.2.1 Slurm + + +Set configuration for a Slurm cluster in the `conf/cluster/bcm.yaml` file: + +```yaml +partition: null +account: null +exclusive: True +gpus_per_task: null +gpus_per_node: 8 +mem: 0 +overcommit: False +job_name_prefix: "nemo-megatron-" +``` + +**Example:** + +To run only the evaluation pipeline and not the data preparation, training, +conversion or inference pipelines set the `conf/config.yaml` file to: + +```yaml +stages: + - peft +``` + +then run: +``` +python3 main.py \ + peft=gpt3/squad \ + stages=["peft"] \ + peft.model.peft.peft_scheme="ptuning" \ + peft.model.megatron_amp_O2=False \ + peft.model.restore_from_path=${LANGUAGE_MODEL_PATH}\ + peft.exp_manager.exp_dir=${BASE_RESULTS_DIR}/${RUN_NAME}/ptuning \ + +``` +##### 5.12.1.2.2 Base Command Platform In order to run the ptuning learning script on Base Command Platform, set the `cluster_type` parameter in `conf/config.yaml` to `bcp` or `interactive`. This can also be overridden @@ -3849,7 +3887,7 @@ python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/main.py \ peft.exp_manager.checkpoint_callback_params.monitor=validation_loss ``` -The command above assumes you mounted the data workspace in `/mount/workspace/` (e.g. the example script uses databricks-dolly-15k dataset), and the results workspace in `/mount/results`. The command needs set different peft.exp_manager.exp_dir for different PEFT jobs. +The command above assumes you mounted the data workspace in `/mount/workspace/` (e.g. the example script uses databricks-dolly-15k dataset), and the results workspace in `/results`. The command needs set different peft.exp_manager.exp_dir for different PEFT jobs. The stdout and stderr outputs will also be redirected to the `/results/nemo_launcher/ptuning_log`, to be able to download the logs from NGC. Any other parameter can also be added to the command to modify its behavior. From 720ad921b86870f956676781c3c239d5988f40f4 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 11:32:32 -0700 Subject: [PATCH 22/62] Update README.md Update Launcher with PEFT Framework and methods --- README.md | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index a5ada22cf6..49a3b934fa 100755 --- a/README.md +++ b/README.md @@ -3787,7 +3787,31 @@ mix-n-match PEFT scheme like adapter_and_ptuning can be easily extended to do ia PTuning does not need to flexibility to insert prompt tokens anywhere in the input. This feature has been removed for simplicity. -##### 5.12.1.2.1 Slurm +##### 5.12.1.2.1. 
Common + +To specify the configuration for ptuning (LoRA, adapter or IA3 learning), +use all the `run` parameters to define the job specific config: +```yaml +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: gpt3_1.3B + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/ptuning_${.task_name} +``` + +To specify which language model checkpoint to load and its definition, use the `model` parameter: + +```yaml +model: + language_model_path: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name}/nemo_gpt1.3B_fp16.nemo + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 +``` + +##### 5.12.1.2.2 Slurm Set configuration for a Slurm cluster in the `conf/cluster/bcm.yaml` file: @@ -3824,7 +3848,7 @@ python3 main.py \ peft.exp_manager.exp_dir=${BASE_RESULTS_DIR}/${RUN_NAME}/ptuning \ ``` -##### 5.12.1.2.2 Base Command Platform +##### 5.12.1.2.3 Base Command Platform In order to run the ptuning learning script on Base Command Platform, set the `cluster_type` parameter in `conf/config.yaml` to `bcp` or `interactive`. This can also be overridden From 18b76ed49b3aa2cbdbf79797acbb8fce54c1941e Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Mon, 14 Aug 2023 11:34:52 -0700 Subject: [PATCH 23/62] Update README.md Update Launcher with PEFT Framework and methods --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 49a3b934fa..485a8fba30 100755 --- a/README.md +++ b/README.md @@ -146,8 +146,9 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) - [5.12.1.1 PEFT Training and Inference](#51211-peft-training-and-inference) + [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcher) - - [5.12.1.2.1 Slurm](#512121-slurm) - - [5.12.1.2.2 Base Command Platform](#512122-base-command-platform) + - [5.12.1.2.1 Common](#512121-common) + - [5.12.1.2.2 Slurm](#512122-slurm) + - [5.12.1.2.3 Base Command Platform](#512123-base-command-platform) - [5.12.2 PEFT Training and Inference for mT5/T5-style Models](#5122-peft-training-and-inference-for-mt5-t5-style-models) - [5.12.2.1 PEFT Training and Inference](#51221-peft-training-and-inference) + [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models) From f9810f0a034d2e4082a06840d5099d22893b4b14 Mon Sep 17 00:00:00 2001 From: Robert Clark Date: Fri, 19 May 2023 14:24:13 -0500 Subject: [PATCH 24/62] Add support for training GPT models on Kubernetes Adding kubernetes support for preparing datasets and training GPT-based foundation models with NeMo Framework as well as conversion to the .nemo format and model evaluation. The kubernetes support creates a Helm chart based off the cluster config settings and each task is launched as a distributed job via Helm. Currently, kubernetes support assumes the following: * Recent versions of the GPU, Network, and Kubeflow Operators are installed. * InfiniBand adapters are labeled as node resources if running multi-node jobs. * The launcher is run from a controller node with access to `kubectl` and `helm` and can launch jobs on the cluster. * The controller node has the ability to install various Python dependencies, including Hydra. 
* All data including the launcher scripts and results/checkpoints will be stored on an NFS filer attached to all nodes. A new k8s cluster setting and config file have been included to allow jobs to run on specific kubernetes cluster. Signed-Off-By: Robert Clark --- Dockerfile | 7 + README.md | 199 ++++++++++++++- launcher_scripts/conf/cluster/k8s.yaml | 6 + launcher_scripts/conf/config.yaml | 4 +- .../gpt3/download_gpt3_pile.yaml | 2 +- .../pile_dataprep/download.py | 2 +- .../dataprep_scripts/pile_dataprep/extract.py | 2 +- .../pile_dataprep/preprocess.py | 2 +- .../nemo_launcher/core/data_stages.py | 94 ++++++- .../nemo_launcher/core/export_stages.py | 2 +- .../core/k8s_templates/conversion/Chart.yaml | 5 + .../k8s_templates/conversion/conversion.yaml | 48 ++++ .../core/k8s_templates/conversion/values.yaml | 40 +++ .../k8s_templates/data_preparation/Chart.yaml | 5 + .../data_preparation/data-prep-config.yaml | 7 + .../data_preparation/data-prep.yaml | 59 +++++ .../data_preparation/values.yaml | 27 ++ .../core/k8s_templates/evaluation/Chart.yaml | 5 + .../evaluation/evaluation-config.yaml | 7 + .../k8s_templates/evaluation/evaluation.yaml | 53 ++++ .../core/k8s_templates/evaluation/values.yaml | 73 ++++++ .../core/k8s_templates/training/Chart.yaml | 5 + .../training/training-config.yaml | 7 + .../core/k8s_templates/training/training.yaml | 71 ++++++ .../core/k8s_templates/training/values.yaml | 28 +++ .../nemo_launcher/core/launchers.py | 68 ++++++ launcher_scripts/nemo_launcher/core/stages.py | 230 +++++++++++++++++- 27 files changed, 1036 insertions(+), 22 deletions(-) create mode 100644 launcher_scripts/conf/cluster/k8s.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/conversion/conversion.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/conversion/values.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/values.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/training/training.yaml create mode 100644 launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml diff --git a/Dockerfile b/Dockerfile index e0144fa03d..b250d0dbe0 100644 --- a/Dockerfile +++ b/Dockerfile @@ -68,6 +68,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \ libsndfile1 \ sox \ swig \ + openssh-server \ libb64-dev && \ rm -rf /var/lib/apt/lists/* @@ -179,6 +180,12 @@ RUN pip install --no-cache-dir wandb==0.15.3 \ # Copy FasterTransformer COPY --from=ft_builder /workspace/FasterTransformer FasterTransformer +# Setup SSH 
config to allow mpi-operator to communicate with containers in k8s +RUN echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \ + sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \ + sed -i 's/# StrictHostKeyChecking ask/ StrictHostKeyChecking no/' /etc/ssh/ssh_config && \ + mkdir -p /var/run/sshd + # Examples WORKDIR /workspace #COPY any user-facing example scripts should go in here diff --git a/README.md b/README.md index 3df4288d54..dbe58feb64 100755 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [4.1.1. Common](#411-common) + [4.1.2. OCI](#412-oci) + [4.1.3. AWS](#413-aws) + + [4.1.4. Kubernetes](#414-k8s) * [4.2. Cluster Validation](#42-cluster-validation) + [4.2.1. Validation Script Usage](#421-validation-script-usage) + [4.2.2 Running tests manually](#422-running-tests-manually) @@ -32,12 +33,14 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [5.1.1. Prepare Environment](#511-prepare-environment) - [5.1.1.1. Slurm](#5111-slurm) - [5.1.1.2. Base Command Platform](#5112-base-command-platform) - - [5.1.1.3. General Configuration](#5113-general-configuration) + - [5.1.1.3. Kubernetes](#5113-kubernetes) + - [5.1.1.4. General Configuration](#5114-general-configuration) + [5.1.2. Data Preparation](#512-data-preparation) - [5.1.2.1. Data Preparation for GPT Models](#5121-data-preparation-for-gpt-models) * [5.1.2.1.1. Slurm](#51211-slurm) * [5.1.2.1.2. Base Command Platform](#51212-base-command-platform) - * [5.1.2.1.3. Common](#51213-common) + * [5.1.2.1.3. Kubernetes](#51213-kubernetes) + * [5.1.2.1.4. Common](#51214-common) - [5.1.2.2. Data Preparation for T5 Models](#5122-data-preparation-for-t5-models) * [5.1.2.2.1. Slurm](#51221-slurm) * [5.1.2.2.2. Base Command Platform](#51222-base-command-platform) @@ -85,6 +88,7 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [5.6.1. GPT Training](#561-gpt-training) - [5.6.1.1. Slurm](#5611-slurm) - [5.6.1.2. Base Command Platform](#5612-base-command-platform) + - [5.6.1.3. Kubernetes](#5613-base-command-platform) + [5.6.2. T5 Training](#562-t5-training) - [5.6.2.1. Slurm](#5621-slurm) - [5.6.2.2. Base Command Platform](#5622-base-command-platform) @@ -100,6 +104,7 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co - [5.8.1.1. Common](#5811-common) - [5.8.1.2. Slurm](#5812-slurm) - [5.8.1.3. Base Command Platform](#5813-base-command-platform) + - [5.8.1.4. Kubernetes](#5814-kubernetes) + [5.8.2. T5 Conversion](#582-t5-conversion) - [5.8.2.1. Common](#5821-common) - [5.8.2.2. Slurm](#5822-slurm) @@ -152,7 +157,8 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co - [5.13.1.1. Common](#51311-common) - [5.13.1.2. Slurm](#51312-slurm) - [5.13.1.3. Base Command Platform](#51313-base-command-platform) - - [5.13.1.4 Interleaved Pipeline Parallelism](#51314-interleaved-pipeline-parallelism) + - [5.13.1.4. Kubernetes](#51314-kubernetes) + - [5.13.1.5 Interleaved Pipeline Parallelism](#51314-interleaved-pipeline-parallelism) + [5.13.2. T5 Evaluation](#5132-t5-evaluation) - [5.13.2.1. Common](#51321-common) - [5.13.2.2. Slurm](#51322-slurm) @@ -371,6 +377,11 @@ Figure 1: The GPT family architecture. 
The 5B variant includes 24 transformer la | HPC-X | 2.13 | | Base Command Manager | 1.0.0 | | DeepOps | 21.06 | +| Kubernetes | 1.27.4 | +| Helm | 3.12.1 | +| GPU Operator | 23.3.2 | +| Network Operator | 23.1.0 | +| KubeFlow Operator | 1.6.0 | ## 4. Cloud Service Providers @@ -421,6 +432,23 @@ On the scheduler node: container: /path/to/nemo_megatron_launcher/nemo_megatron_training.sqsh ``` +#### 4.1.4. Kubernetes + +Data preparation and training GPT models is currently supported on vanilla kubernetes (k8s) clusters. +The launcher scripts will generate a Helm chart for each task based on the config files and launch the job using the chart. + +The following is required for running jobs on Kubernetes: + * One or more DGX A100s/H100s as worker nodes + * An NFS filesystem where the data and launcher scripts will be stored which is accessible on all worker and controller nodes + * A head/controller node which has access to the worker nodes and can run `kubectl` and `helm` to launch jobs and can install Python dependencies + * Recent versions of the GPU, Network, and KubeFlow Operators installed + +A secret key needs to be configured to allow kubernetes to pull from the private registry. For example, if pulling the container directly +from NGC, a secret needs to be created to authenticate with the private NGC registry, such as the following: +``` +kubectl create secret docker-registry ngc-registry --docker-server=nvcr.io --docker-username=\$oauthtoken --docker-password= +``` + ### 4.2. Cluster Validation @@ -604,7 +632,22 @@ creating these workspaces (e.g. `nemo_megatron_data_ws` and `nemo_megatron_resul the Base Command Platform User Guide for how to create and work with Base Command Platform workspaces. -##### 5.1.1.3. General Configuration +##### 5.1.1.3. Kubernetes + + +The launcher scripts need to be downloaded to the NFS filesystem that is +connected to the worker nodes. This can either be copied at +`/opt/NeMo-Megatron-Launcher` from inside the training container or by cloning +this repository. + +Install the NeMo Framework scripts dependencies on the head node/controller of +the cluster where jobs will be launched: + +``` +pip install -r requirements.txt +``` + +##### 5.1.1.4. General Configuration The first parameter that must be set is the `launcher_scripts_path` parameter inside the @@ -852,8 +895,36 @@ The command above assumes you want to prepare the entire dataset (files 0-29), a workspace in `/mount/data`, and the results workspace in `/mount/results`. Stdout and stderr are redirected to the `/results/data_gpt3_log.txt` file, so it can be downloaded from NGC. Any other parameter can also be added to the command to modify its behavior. -###### 5.1.2.1.3. Common - +###### 5.1.2.1.3. Kubernetes + + +To run data preparation on a kubernetes cluster, set both the `cluster` and +`cluster_type` parameters to `k8s` in `conf/config.yaml`. Additionally, set the +`launcher_scripts_path` parameter to the location where the launcher scripts +are located on the NFS filesystem. This must be the same path on all nodes in +the cluster. Ensure the `stages` parameter is set to `data_preparation` and +`data_preparation` in the `defaults` section points to the intended data +preparation script. + +The `conf/config/k8s.yaml` file also needs to be updated with the +kubernetes container registry secret if created earlier (`pull_secret`), the +`shm_size` to determine how much local memory to put in each pod, and the NFS +server and path to where the launcher scripts are saved. 
These can all be +overridden from the command line using hydra as well. + +Once all of the config files are updated, the data preparation can be launched +from the controller node with: + +``` +python main.py +``` + +This will generate and launch a job via Helm in the default namespace which +can be viewed with `helm show` or `kubectl get pods`. The logs can be followed +with `kubectl logs `. + +###### 5.1.2.1.4. Common + Set the configuration for the data preparation job for GPT models in the YAML file: ```yaml @@ -2462,6 +2533,89 @@ Select the cluster related configuration following the NGC documentation. Then, use the `python3 main.py` command to launch the job and override the desired parameters from the training job parameters. +##### 5.6.1.3. Kubernetes + + +Set configuration for your Kubernetes cluster in the `conf/cluster/k8s.yaml` file: + +```yaml +pull_secret: null +shm_size: 512Gi +nfs_server: null +nfs_path: null +ib_resource_name: "nvidia.com/hostdev" +ib_count: "8" +``` + +The settings are as follows: + * `pull_secret`: The name of the sercret key created with `kubectl` that will + be used to authenticate with private registries for pulling the training + container. + * `shm_size`: The amount of shared memory to include in the Pods. It is + recommended to use a large value here. + * `nfs_server`: The IP address or hostname of the NFS server that the worker + nodes will read and write data to/from. + * `nfs_path`: The absolute path on the NFS server that should be mounted + inside the Pods. + * `ib_resource_name`: The name of the IB interconnect to attach to Pods for + multi-node training. This is the name that Kubernetes assigns to the NICs as + allocatable resources. + * `ib_count`: The number of IB interconnects to include per node in each pod. + This will likely equal the total number of active/usable compute NICs per + node. + +And set the training job specific parameters in the `conf/training/(model_type)/(model_size).yaml` file, +using the run section: +```yaml +run: + name: gpt3_126m + results_dir: ${base_results_dir}/${.name} + time_limit: "1-12:00:00" + dependency: "singleton" +``` + +To run only the training pipeline and not the data preparation, evaluation or +inference pipelines, set the `conf/config.yaml` file to: + +```yaml +stages: + - training +``` + +Also set the `cluster` and `cluster_type` values to `k8s` in the +`conf/config.yaml` file. + +And then run: +``` +python3 main.py +``` + +Once the launcher is run, it will display the path to the Helm chart that was +generated based on the updated config files. The Helm chart will be located in +the job results directory by default. The chart will be run automatically and +Pods will be started by Kubernetes once resources become available. The status +of the Helm chart can be checked with: + +``` +$ helm list +NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION +gpt-7b-improved default 1 2023-07-17 14:10:11.794541205 -0700 PDT deployed nemo-framework-training-1.0.0 1.0 +``` + +Once allocated, this will spin up N pods for N number of nodes requested. To +view training progress follow the log of the first pod, typically named +`nlp-training-worker-0`. + +Once a job is finished, it will be marked as complete via Helm and can be +uninstalled with (note - replace `` with the name of the Helm chart +as shown in the previous example): + +``` +$ helm uninstall +``` + +The uninstallation will not affect the completed job - it will only mark the +resources as free for Kubernetes to use them for future tasks. 
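+
+For reference, a minimal monitoring sequence could look like the following
+(the worker pod names assume the default `nlp-training` job created by the
+chart; replace `<chart_name>` with the release name reported by `helm list`):
+
+```
+kubectl get pods                       # list the training worker pods
+kubectl logs -f nlp-training-worker-0  # follow the log of the first worker
+helm status <chart_name>               # check the overall status of the release
+```
+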
#### 5.6.2. T5 Training @@ -2749,6 +2903,22 @@ The command above assumes you mounted the data workspace in `/mount/data`, and t The stdout and stderr outputs will also be redirected to the `/results/convert_gpt3_log.txt` file, to be able to download the logs from NGC. Any other parameter can also be added to the command to modify its behavior. +##### 5.8.1.4. Kubernetes + +To convert a model to the `.nemo` format on a Kubernetes cluster, set both the +`cluster` and `cluster_type` parameters to `k8s` in `conf/config.yaml`. Update +the `conf/conversion/gpt3/convert_gpt3.yaml` config file to point to the model +you would like to convert. + +Once the configs are ready, run: + +``` +python3 main.py +``` + +This will launch a Helm chart that will spawn a job that runs on one of the +compute nodes to convert the requested model to the `.nemo` format. + #### 5.8.2. T5 Conversion @@ -3928,7 +4098,22 @@ The command above assumes you mounted the data workspace in `/mount/data`, and t The stdout and stderr outputs will also be redirected to the `/results/eval_gpt3_log.txt` file, to be able to download the logs from NGC. Any other parameter can also be added to the command to modify its behavior. -##### 5.13.1.4 Interleaved Pipeline Parallelism +##### 5.13.1.4. Kubernetes + +To evaluate base models on Kubernetes clusters, set the `cluster` and +`cluster_type` parameters to `k8s` in `conf/config.yaml`. Update either the +`conf/evaluation/gpt3/evaluate_all.yaml` or `conf/evaluation/gpt3/evaluate_lambada.yaml` +file based on your cluster and desired evaluation tasks. Once the configurations +are updated, launch an evaluation job with: + +``` +python3 main.py +``` + +This will launch a Helm chart based on the evaluation configurations which will +download all task files and run evaluation against the specified model. + +##### 5.13.1.5 Interleaved Pipeline Parallelism If your model was trained with interleaved pipeline parallelism, then the model must converted to a non-interleaved model. In order to check if your model used interleaved, inspect the training config and verify that diff --git a/launcher_scripts/conf/cluster/k8s.yaml b/launcher_scripts/conf/cluster/k8s.yaml new file mode 100644 index 0000000000..d609fb3901 --- /dev/null +++ b/launcher_scripts/conf/cluster/k8s.yaml @@ -0,0 +1,6 @@ +pull_secret: null # Kubernetes secret for the container registry to pull private containers. +shm_size: 512Gi # Amount of system memory to allocate in Pods. Should end in "Gi" for gigabytes. +nfs_server: null # Hostname or IP address for the NFS server where data is stored. +nfs_path: null # Path to store data in the NFS server. +ib_resource_name: "nvidia.com/hostdev" # Specify the resource name for IB devices according to kubernetes, such as "nvidia.com/hostdev" for Mellanox IB adapters. +ib_count: "8" # Specify the number of IB devices to include per node in each pod. diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index e5e1da9e0a..6d9fd9356d 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -1,6 +1,6 @@ defaults: - _self_ - - cluster: bcm # Leave it as bcm even if using bcp. It will be ignored for bcp. + - cluster: bcm # Set to bcm for BCM and BCP clusters. Set to k8s for a k8s cluster. - data_preparation: gpt3/download_gpt3_pile - training: gpt3/5b - conversion: gpt3/convert_gpt3 @@ -25,7 +25,7 @@ stages: - evaluation - export -cluster_type: bcm # bcm or bcp. If bcm, it must match - cluster above. +cluster_type: bcm # bcm, bcp, or k8s. 
If bcm or k8s, it must match - cluster above. launcher_scripts_path: ??? # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts data_dir: ${launcher_scripts_path}/data # Location to store and read the data. base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. diff --git a/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml b/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml index 632ccdadd2..ab6614480a 100755 --- a/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml +++ b/launcher_scripts/conf/data_preparation/gpt3/download_gpt3_pile.yaml @@ -9,7 +9,7 @@ run: dataset: pile download_the_pile: True # Whether to download the pile dataset from the internet. -the_pile_url: "https://mystic.the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. +the_pile_url: "https://the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json" # URL to download the vocab from. diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/download.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/download.py index 917961a51a..80831b3960 100755 --- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/download.py +++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/download.py @@ -35,7 +35,7 @@ def main(cfg): url = f"{pile_url_train}{file_number:02d}.jsonl.zst" output_file = f"{file_number:02d}.jsonl.zst" downloaded_path = utils.download_single_file(url, data_dir, output_file) - if cfg.get("cluster_type") == "bcp": + if cfg.get("cluster_type") in ["bcp", "k8s"]: file_numbers = cfg["file_numbers"] # Downloading the files files_list = utils.convert_file_numbers(file_numbers) diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/extract.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/extract.py index 5093543528..16fef5ef28 100755 --- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/extract.py +++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/extract.py @@ -35,7 +35,7 @@ def main(cfg) -> None: downloaded_path = os.path.join(data_dir, f"{file_number:02d}.jsonl.zst") output_file = f"{file_number:02d}.jsonl" utils.extract_single_zst_file(downloaded_path, data_dir, output_file, rm_downloaded) - elif cfg.get("cluster_type") == "bcp": + elif cfg.get("cluster_type") in ["bcp", "k8s"]: file_numbers = cfg.get("file_numbers") # Downloading the files files_list = utils.convert_file_numbers(file_numbers) diff --git a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py index 61a9e36560..2117a27d5c 100755 --- a/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py +++ b/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/preprocess.py @@ -91,7 +91,7 @@ def main(cfg): os.system(runcmd) if rm_extracted: os.remove(extracted_path) - elif cfg.get("cluster_type") == "bcp": + elif cfg.get("cluster_type") in ["bcp", 
"k8s"]: file_numbers = cfg.get("file_numbers") files_list = utils.convert_file_numbers(file_numbers) # Assumes launched via mpirun: diff --git a/launcher_scripts/nemo_launcher/core/data_stages.py b/launcher_scripts/nemo_launcher/core/data_stages.py index c3713786e5..b33ece406b 100755 --- a/launcher_scripts/nemo_launcher/core/data_stages.py +++ b/launcher_scripts/nemo_launcher/core/data_stages.py @@ -16,11 +16,13 @@ import os from pathlib import Path from typing import Dict, List, Optional - import omegaconf +import shutil + from nemo_launcher.core.launchers import AutoLauncher from nemo_launcher.core.stages import NemoMegatronStage, clean_command_groups, create_args_list from nemo_launcher.utils.file_utils import download_single_file +from nemo_launcher.utils.job_utils import JobPaths class DataStage(NemoMegatronStage): @@ -55,7 +57,7 @@ def run(self) -> str: job_path = self.get_job_path(sub_stage) job_path.folder.mkdir(parents=True, exist_ok=True) - stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path) + stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path, self.cfg) if job_id: dependency = f"aftercorr:{job_id}" self.stage_cfg["run"]["dependency"] = dependency @@ -65,9 +67,24 @@ def run(self) -> str: # Make command groups command_groups = self.make_stage_command_groups(stage_cfg_path, sub_stage) + + # Prepare Helm chart for k8s + if self.cluster == 'k8s': + template_root = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'k8s_templates/data_preparation') + self._make_k8s_helm_chart(template_root, cluster_parameters, job_path, sub_stage) + # Create launcher launcher = AutoLauncher(folder=job_path.folder, cluster=self.cluster, **cluster_parameters,) - job_id = launcher.launch(command_groups=command_groups) + + if self.cluster == 'k8s': + # For k8s clusters, only launch on the final stage (preprocess) as + # the Helm chart contains all stages in a single chart. + if sub_stage == sub_stages[-1]: + job_id = launcher.launch(command_groups=command_groups) + else: + job_id = '' + else: + job_id = launcher.launch(command_groups=command_groups) return job_id @@ -97,11 +114,11 @@ def _make_private_cluster_parameters(self, cluster, sub_stage): def _make_cluster_parameters(self, cluster: str, sub_stage: Optional = None,) -> Dict: """ Make a cluster-specific parameters for jobs on different clusters. - Current clusters include bcm(slurm), bcp and interactive. + Current clusters include bcm(slurm), bcp, k8s, and interactive. For example for bcm, it will return slurm parameters: {'job_name': 'some_name', 'nodes': 2, 'ntasks_per_node': 8, ...} - :param str cluster: i.e. `bcm`, `bcp`, `interactive`, etc. + :param str cluster: i.e. `bcm`, `bcp`, `interactive`, `k8s`, etc. :param Optional sub_stage: current sub_stage name :return: a dictionary of cluster parameters, e.g. 
`ntasks_per_node` :rtype: Dict @@ -142,11 +159,78 @@ def _make_cluster_parameters(self, cluster: str, sub_stage: Optional = None,) -> cluster_parameters.update( {**shared_parameters, **private_parameters,} ) + elif cluster == "k8s": + cluster_cfg = cfg.get("cluster") + container_image = cfg.get("container") + k8s_cfg = {**copy.deepcopy(cluster_cfg)} + + cluster_parameters = {**k8s_cfg} + + cluster_parameters.update( + { + **shared_parameters, + **private_parameters, + "container_image": container_image,} + ) elif cluster == "interactive": raise ValueError("Data preparation is not supported in interactive mode.") return cluster_parameters + def _make_k8s_helm_chart(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths, sub_stage: str): + """ + Create a Helm chart for data preparation. + The Helm chart uses a base template which is extended with user-defined + cluster settings as specified in the config files. The generated Hydra + config file needs to be copied to the Helm chart as this will be used + for launching the job. + + :param str template_root: the path to where the k8s template files are located. + :param dict cluster_parameters: additional parameters specific to the cluster config. + :param JobPaths job_path: the path to the job results directory. + :param str sub_stage: the current stage. + """ + with open(os.path.join(template_root, 'values.yaml')) as value_file: + values_template = omegaconf.OmegaConf.load(value_file) + + procs_per_node = self.stage_cfg.run.bcp_preproc_npernode if sub_stage == "preprocess" else 1 + total_processes = procs_per_node * self.stage_cfg.run.node_array_size + + # Update the Helm chart template with the user-specified settings + values_template.image.trainingImage = cluster_parameters['container_image'] + values_template.image.pullSecret = cluster_parameters['pull_secret'] + values_template.image.nodes = self.stage_cfg.run.node_array_size + values_template.dataPrepConfig.shmSize = cluster_parameters['shm_size'] + values_template.dataPrepConfig.NFSServer = cluster_parameters['nfs_server'] + values_template.dataPrepConfig.NFSPath = cluster_parameters['nfs_path'] + values_template.dataPrepConfig.totalProcesses = total_processes + values_template.dataPrepConfig.procsPerNode = procs_per_node + values_template.dataPrepConfig.stage = sub_stage + + k8s_template_path = job_path.folder + k8s_template_file = Path(k8s_template_path / 'k8s_template' / 'values.yaml') + k8s_template_file.parent.mkdir(parents=True, exist_ok=True) + + conf = omegaconf.OmegaConf.create(values_template) + omegaconf.OmegaConf.save(conf, k8s_template_file) + + # Copy the data prep spec files to the Helm chart + template_file = os.path.join(template_root, 'data-prep.yaml') + chart_file = os.path.join(template_root, 'Chart.yaml') + data_prep_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'data-prep.yaml') + data_prep_path.parent.mkdir(parents=True, exist_ok=True) + config_path = Path(job_path.folder / 'k8s_template' / 'config') + config_path.mkdir(parents=True, exist_ok=True) + chart_path = Path(job_path.folder / 'k8s_template' / 'Chart.yaml') + data_prep_config_file = os.path.join(template_root, 'data-prep-config.yaml') + data_prep_config_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'data-prep-config.yaml') + hydra_config_path = Path(job_path.folder / 'k8s_template' / 'config') + + shutil.copy2(template_file, data_prep_path) + shutil.copy2(chart_file, chart_path) + shutil.copy2(data_prep_config_file, data_prep_config_path) + 
shutil.copy2(job_path.config_file, hydra_config_path) + class PileDataPreparation(DataStage): """DataStage for preparing the Pile dataset for gpt3 and t5""" diff --git a/launcher_scripts/nemo_launcher/core/export_stages.py b/launcher_scripts/nemo_launcher/core/export_stages.py index dea2296db6..171e7d2c29 100755 --- a/launcher_scripts/nemo_launcher/core/export_stages.py +++ b/launcher_scripts/nemo_launcher/core/export_stages.py @@ -108,7 +108,7 @@ def run(self) -> str: job_path = self.get_job_path(sub_stage) job_path.folder.mkdir(parents=True, exist_ok=True) - stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path) + stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path, self.cfg) if job_id: dependency = f"aftercorr:{job_id}" self.stage_cfg["run"]["dependency"] = dependency diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml new file mode 100644 index 0000000000..bbf3651743 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +appVersion: "1.0" +description: NeMo Framework Base Model Conversion +name: nemo-framework-conversion +version: 1.0.0 diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/conversion.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/conversion.yaml new file mode 100644 index 0000000000..214e14df69 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/conversion.yaml @@ -0,0 +1,48 @@ +{{ $config := .Values.trainingConfig }} + +apiVersion: batch/v1 +kind: Job +metadata: + name: nlp-conversion + labels: + app: nlp-conversion +spec: + template: + spec: + containers: + - name: nlp-conversion + image: {{ .Values.image.trainingImage }} + env: + - name: NCCL_AVOID_RECORD_STREAMS + value: "1" + command: ["/bin/bash", "-c"] + args: + - 'export CKPT_NAME=$(python3 {{ $config.launcherScriptsPath }}/nemo_launcher/collections/checkpoint_search.py checkpoint_folder={{ $config.trainingDirectory }}/results/checkpoints checkpoint_name=latest tensor_model_parallel_size=1 pipeline_model_parallel_size=1) && + echo ${CKPT_NAME} && + python3 {{ $config.launcherScriptsPath }}/nemo_launcher/collections/hparams_override.py hparams_file={{ $config.trainingDirectory }}/results/hparams.yaml output_path={{ $config.resultsDirectory }}/results vocab_file={{ $config.vocabPath }} merge_file={{ $config.mergesPath }} tokenizer_model=None && + python3 /opt/NeMo/examples/nlp/language_modeling/megatron_ckpt_to_nemo.py --gpus_per_node=1 --model_type=gpt --checkpoint_folder={{ $config.trainingDirectory }}/results/checkpoints --checkpoint_name=${CKPT_NAME} --hparams_file={{ $config.resultsDirectory }}/results/hparams_override.yaml --nemo_file_path={{ $config.resultsDirectory }}/megatron_gpt.nemo --tensor_model_parallel_size={{ $config.tensorParallelism }} --pipeline_model_parallel_size={{ $config.pipelineParallelism }}' + imagePullPolicy: Always + resources: + requests: + nvidia.com/gpu: {{ .Values.image.gpuNum }} + limits: + nvidia.com/gpu: {{ .Values.image.gpuNum }} + volumeMounts: + - mountPath: {{ $config.NFSPath }} + name: workspace + - mountPath: /dev/shm + name: dshm + restartPolicy: Never + imagePullSecrets: + - name: {{ .Values.image.pullSecret }} + + volumes: + - name: workspace + nfs: + server: {{ $config.NFSServer }} + path: {{ $config.NFSPath }} + + - name: dshm + emptyDir: + medium: Memory + 
sizeLimit: {{ $config.shmSize }} diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/values.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/values.yaml new file mode 100644 index 0000000000..21df8fd095 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/conversion/values.yaml @@ -0,0 +1,40 @@ +image: + trainingImage: cfg.container + pullPolicy: IfNotPresent + + # Insert the name of your container registry pull secret # + pullSecret: nvcr.io + + # Insert number of GPUs # + gpuNum: + +trainingConfig: + # Specify the amount of shared memory to attach to the Pods # + shmSize: 512Gi + + # Insert the address for the NFS server if using NFS for model storage # + NFSServer: + + # Insert the path to save data on the NFS server # + NFSPath: + + # Insert the path to the vocab file # + vocabPath: + + # Insert the path to the merges file # + mergesPath: + + # Insert the path to the results directory # + resultsDirectory: + + # Insert the path to the training directory # + trainingDirectory: + + # Insert the path to the launcher_scripts directory # + launcherScriptsPath: + + # Insert the TP size # + tensorParallelism: + + # Insert the PP size # + pipelineParallelism: diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml new file mode 100644 index 0000000000..d2337c69ac --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +appVersion: "1.0" +description: NeMo Framework Data Preparation +name: nemo-framework-data-prep +version: 1.0.0 diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml new file mode 100644 index 0000000000..338acfb9a5 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep-config.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: data-prep-config +data: + config.yaml: |- + {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep.yaml new file mode 100644 index 0000000000..8ab7a76207 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/data-prep.yaml @@ -0,0 +1,59 @@ +{{ $config := .Values.dataPrepConfig }} + +apiVersion: kubeflow.org/v2beta1 +kind: MPIJob +metadata: + name: nlp-data-prep + labels: + app: nlp-data-prep +spec: + mpiReplicaSpecs: + Launcher: + replicas: 1 + template: + spec: + containers: + - name: nlp-data-prep + image: {{ .Values.image.trainingImage }} + command: ["bash", "-c"] + args: + - '{{- range tuple "download" "extract" "preprocess" }} mpirun --allow-run-as-root -np {{ $config.totalProcesses }} -npernode {{ $config.procsPerNode }} -bind-to none -map-by slot --oversubscribe -x PYTHONPATH -mca pml ob1 -mca btl ^openib python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/nemo_launcher/collections/dataprep_scripts/pile_dataprep/{{ . 
}}.py --config-path=/config --config-name=config.yaml && {{- end}} echo Data preparation complete' + imagePullPolicy: Always + imagePullSecrets: + - name: {{ .Values.image.pullSecret }} + Worker: + replicas: {{ .Values.image.nodes }} + template: + spec: + containers: + - name: nlp-data-prep + image: {{ .Values.image.trainingImage }} + command: ["/usr/sbin/sshd"] + args: + - "-De" + volumeMounts: + - mountPath: {{ $config.NFSPath }} + name: workspace + - mountPath: /dev/shm + name: dshm + - mountPath: /config + name: data-prep-config + imagePullPolicy: Always + restartPolicy: Never + imagePullSecrets: + - name: {{ .Values.image.pullSecret }} + + volumes: + - name: workspace + nfs: + server: {{ $config.NFSServer }} + path: {{ $config.NFSPath }} + + - name: dshm + emptyDir: + medium: Memory + sizeLimit: {{ $config.shmSize }} + + - configMap: + name: data-prep-config + name: data-prep-config diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml new file mode 100644 index 0000000000..e5a8bc7987 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/data_preparation/values.yaml @@ -0,0 +1,27 @@ +image: + trainingImage: cfg.container + pullPolicy: IfNotPresent + + # Insert the name of your container registry pull secret # + pullSecret: nvcr.io + + nodes: training.trainer.num_nodes + +dataPrepConfig: + # Specify the amount of shared memory to attach to the Pods # + shmSize: 512Gi + + # Insert the address for the NFS server if using NFS for model storage # + NFSServer: + + # Insert the path to save data on the NFS server # + NFSPath: + + # Insert the total number of processes to spawn on the cluster # + totalProcesses: + + # Insert the number of processes to spawn per node # + procsPerNode: + + # Insert the data preparation stage, such as download, extract, or preprocess # + stage: diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml new file mode 100644 index 0000000000..4c291917f1 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +appVersion: "1.0" +description: NeMo Framework Evaluation +name: nemo-framework-evaluation +version: 1.0.0 diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml new file mode 100644 index 0000000000..080bbcc6b3 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation-config.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: evaluation-config +data: + hparams.yaml: |- + {{ (.Files.Glob "config/hparams.yaml").AsConfig | indent 4 }} diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation.yaml new file mode 100644 index 0000000000..7278d1385e --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/evaluation.yaml @@ -0,0 +1,53 @@ +{{ $config := .Values.trainingConfig }} + +apiVersion: batch/v1 +kind: Job +metadata: + name: nlp-evaluation + labels: + app: nlp-evaluation +spec: + template: + spec: + containers: + - name: nlp-evaluation + image: {{ .Values.image.trainingImage }} + env: + - name: NCCL_AVOID_RECORD_STREAMS + value: "1" + command: 
["/bin/bash", "-c"] + args: + - 'python3 {{ $config.launcherScriptsPath }}/nemo_launcher/collections/eval_harness/download.py --tasks=all_tasks --cache-dir={{ $config.cacheDir }} && + mkdir -p {{ $config.outputPath }} && + python3 {{ $config.launcherScriptsPath }}/nemo_launcher/collections/eval_harness/evaluate.py --name={{ $config.name }} --model={{ $config.model }} --tasks={{ $config.tasks }} --cache_dir={{ $config.cacheDir }} --output_path={{ $config.outputPath }} --batch_size={{ $config.batchSize }} --tensor_model_parallel_size={{ $config.tensorParallelism }} --pipeline_model_parallel_size={{ $config.pipelineParallelism }} --precision={{ $config.precision }} --vocab_file={{ $config.vocabPath }} --merge_file={{ $config.mergesPath }} {{- if $config.nemoModel }} --nemo_model={{ $config.nemoModel }}{{ end }} --checkpoint_folder={{ $config.checkpointFolder }} --checkpoint_name={{ $config.checkpointName }} --hparams_file=/config/hparams.yaml' + imagePullPolicy: Always + resources: + requests: + nvidia.com/gpu: {{ .Values.image.gpuNum }} + limits: + nvidia.com/gpu: {{ .Values.image.gpuNum }} + volumeMounts: + - mountPath: {{ $config.NFSPath }} + name: workspace + - mountPath: /dev/shm + name: dshm + - mountPath: /config + name: evaluation-config + restartPolicy: Never + imagePullSecrets: + - name: {{ .Values.image.pullSecret }} + + volumes: + - name: workspace + nfs: + server: {{ $config.NFSServer }} + path: {{ $config.NFSPath }} + + - name: dshm + emptyDir: + medium: Memory + sizeLimit: {{ $config.shmSize }} + + - configMap: + name: evaluation-config + name: evaluation-config diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/values.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/values.yaml new file mode 100644 index 0000000000..0fcfe4c835 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/evaluation/values.yaml @@ -0,0 +1,73 @@ +image: + trainingImage: cfg.container + pullPolicy: IfNotPresent + + # Insert the name of your container registry pull secret # + pullSecret: nvcr.io + + # Insert number of GPUs # + gpuNum: 1 + +trainingConfig: + # Specify the amount of shared memory to attach to the Pods # + shmSize: 512Gi + + # Insert the address for the NFS server if using NFS for model storage # + NFSServer: + + # Insert the path to save data on the NFS server # + NFSPath: + + # Insert the path to the vocab file # + vocabPath: + + # Insert the path to the merges file # + mergesPath: + + # Insert the path to the results directory # + resultsDirectory: + + # Insert the path to the training directory # + trainingDirectory: + + # Insert the path to the launcher_scripts directory # + launcherScriptsPath: + + # Insert the TP size # + tensorParallelism: + + # Insert the PP size # + pipelineParallelism: + + # Insert evaluation task name # + name: + + # Insert name of model to evaluate # + model: + + # Insert which tasks to evaluate # + tasks: + + # Insert path to store downloaded eval data # + cacheDir: + + # Insert path to save evaluation results # + outputPath: + + # Insert batch size for evaluation # + batchSize: + + # Insert evaluation precision # + precision: + + # Specify the path to the .nemo model if used # + nemoModel: + + # Insert path the the training checkpoint directory # + checkpointFolder: + + # Insert name of checkpoint or "latest" # + checkpointName: + + # Insert path to the hparams file from the training job # + hparamsFile: diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml 
b/launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml new file mode 100644 index 0000000000..e2314f8ec3 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/training/Chart.yaml @@ -0,0 +1,5 @@ +apiVersion: v2 +appVersion: "1.0" +description: NeMo Framework Base Model Training +name: nemo-framework-training +version: 1.0.0 diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml new file mode 100644 index 0000000000..ce3095184c --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/training/training-config.yaml @@ -0,0 +1,7 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: training-config +data: + config.yaml: |- + {{ (.Files.Glob "config/*hydra.yaml").AsConfig | indent 4 }} diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/training/training.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/training/training.yaml new file mode 100644 index 0000000000..37f37a1317 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/training/training.yaml @@ -0,0 +1,71 @@ +{{ $config := .Values.trainingConfig }} + +apiVersion: kubeflow.org/v1 +kind: PyTorchJob +metadata: + name: nlp-training + labels: + app: nlp-training +spec: + pytorchReplicaSpecs: + Worker: + replicas: {{ .Values.image.nodes }} + template: + spec: + containers: + - name: pytorch + image: {{ .Values.image.trainingImage }} + env: + - name: NCCL_AVOID_RECORD_STREAMS + value: "1" + {{ if eq $config.wandbKey "nil" }} + command: ["torchrun"] + args: + - "--nnodes={{ .Values.image.nodes }}" + - "--rdzv-backend=c10d" + - "--rdzv-endpoint=nlp-training-worker-0" + - "--nproc_per_node={{ .Values.image.numGPUs }}" + - "/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py" + - "--config-path=/config" + - "--config-name=config.yaml" + {{ else }} + command: ["bash", "-c"] + args: + - "wandb login {{ $config.wandbKey }} && torchrun --nnodes={{ .Values.image.nodes }} --rdzv-backend=c10d --rdzv-endpoint=nlp-training-worker-0 --nproc_per_node={{ .Values.image.numGPUs }} /opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py --config-path=/config --config-name=config.yaml" + {{ end }} + imagePullPolicy: Always + securityContext: + capabilities: + add: [ "IPC_LOCK" ] + resources: + requests: + nvidia.com/gpu: {{ .Values.image.numGPUs }} + {{ $config.ibResourceName }}: {{ $config.ibCount }} + limits: + nvidia.com/gpu: {{ .Values.image.numGPUs }} + {{ $config.ibResourceName }}: {{ $config.ibCount }} + volumeMounts: + - mountPath: {{ $config.NFSPath }} + name: workspace + - mountPath: /dev/shm + name: dshm + - mountPath: /config + name: training-config + restartPolicy: Never + imagePullSecrets: + - name: {{ .Values.image.pullSecret }} + + volumes: + - name: workspace + nfs: + server: {{ $config.NFSServer }} + path: {{ $config.NFSPath }} + + - name: dshm + emptyDir: + medium: Memory + sizeLimit: {{ $config.shmSize }} + + - configMap: + name: training-config + name: training-config diff --git a/launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml b/launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml new file mode 100644 index 0000000000..553be55b19 --- /dev/null +++ b/launcher_scripts/nemo_launcher/core/k8s_templates/training/values.yaml @@ -0,0 +1,28 @@ +image: + trainingImage: cfg.container + pullPolicy: IfNotPresent + + # Insert the name of your container registry pull secret # + 
pullSecret: nvcr.io + + numGPUs: training.trainer.devices + nodes: training.trainer.num_nodes + +trainingConfig: + # Specify the amount of shared memory to attach to the Pods # + shmSize: 512Gi + + # Insert the address for the NFS server if using NFS for model storage # + NFSServer: + + # Insert the path to save data on the NFS server # + NFSPath: + + # Specify the k8s resource name for IB devices # + ibResourceName: nvidia.com/hostdev + + # Specity the number of IB devices to include in pods # + ibCount: "8" + + # Specify the WandB API key if using WandB for logging # + wandbKey: "nil" diff --git a/launcher_scripts/nemo_launcher/core/launchers.py b/launcher_scripts/nemo_launcher/core/launchers.py index f31fb7bba8..6e6d497337 100755 --- a/launcher_scripts/nemo_launcher/core/launchers.py +++ b/launcher_scripts/nemo_launcher/core/launchers.py @@ -21,6 +21,8 @@ import shlex import shutil import warnings +from omegaconf import OmegaConf, DictConfig +import yaml from pathlib import Path from typing import Any, Dict, Iterable, List, Optional, Set, Union @@ -70,6 +72,7 @@ def get_launchers(): "bcm": SlurmLauncher, "bcp": BCPLauncher, "interactive": InteractiveLauncher, + "k8s": K8SLauncher, } @@ -114,6 +117,7 @@ def _make_submission_file(self, command_groups: List[List[str]]) -> Path: on interactive cluster, it's a bash file, trigger with bash. on slurm cluster, it's a slurm script file, trigger with sbatch. on BCP cluster, it's a BCP script file, trigger with bash. + on k8s cluster, it's a Helm chart, triggered with helm. :param List[List[str]] command_groups: Command groups to launch with :return: job id on slurm based system otherwise empty string @@ -431,6 +435,70 @@ def _get_job_id_from_submission_command(string: Union[bytes, str]) -> str: return output.group("id") +class K8SLauncher(Launcher): + """ + K8s job launcher + This class is used to hold the parameters to run a job on kubernetes. + In practice, it will create a Helm chart in the specified directory for the job + and trigger the job with `bash` command. + + :param Union[Path, str] folder: folder for storing job submission/output and logs. + :param str job_name: Name of the job, used as job folder name + :param Any **kwargs: Parse other cluster parameters required for k8s running, + including `nodes`, `ntasks_pernode`, `bcp_launcher`, etc. + """ + + def __init__(self, folder: Union[Path, str], job_name: str, **kwargs: Any) -> None: + super().__init__(folder, job_name) + self.parameters = kwargs + self.parameters = self._convert_parameters(self.parameters) + + @classmethod + def _equivalence_dict(cls): + return { + "name": "job_name", + "nodes": "nnodes", + "tasks_per_node": "npernode", + "ntasks_per_node": "npernode", + } + + def _convert_parameters(self, params: Dict[str, Any]) -> Dict[str, Any]: + """translate k8s parameter names""" + # replace type in some cases + eq_dict = self._equivalence_dict() + if eq_dict is not None: + params = {eq_dict.get(k, k): v for k, v in params.items()} + return params + + def _submit_command(self, submission_file_path: Path) -> str: + """Launch the submission command""" + command_list = self._make_submission_command(submission_file_path) + # run + job_utils.CommandFunction(command_list, ret_stdout=False, verbose=False)() # explicit errors + return "" + + @staticmethod + def _make_submission_command(submission_file_path: Path) -> List[str]: + """Make a command to trigger submission script. 
On a k8s cluster, the script is triggerred with Helm""" + return ["bash", str(submission_file_path)] + + def _make_submission_file_text(self, command_groups: List[List[str]]) -> str: + """ + Generate the script to launch the Helm chart. + A very simple bash script is generated which runs `helm install` for the + Helm chart that was generated. + + :param List[List[str]] command_groups: Command groups to launch with + :return: submission script file's text + :rtype: str + """ + paths = job_utils.JobPaths(folder=self.folder, job_name=self.job_name) + helm_charts = paths.folder / 'k8s_template' + job_name = self.job_name.replace('_', '-') + + return f'#!/bin/bash\nhelm install {job_name} {helm_charts}\n' + + @functools.lru_cache() def _get_default_parameters() -> Dict[str, Any]: """Parameters that can be set through update_parameters""" diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 56404956e1..bcc5cf3f26 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -19,6 +19,7 @@ import json import re from pathlib import Path +import shutil from typing import Any, Dict, List, Optional import omegaconf @@ -28,7 +29,7 @@ prepare_squad_for_prompt_learning, ) from nemo_launcher.utils.job_utils import JobPaths -from omegaconf import OmegaConf +from omegaconf import OmegaConf, DictConfig class NemoMegatronStage: @@ -73,9 +74,14 @@ def run(self) -> str: self.cfg['training']["trainer"]["num_nodes"] = nodes logging.info(f"global batch size and number of nodes will change following this schedule:\n {self.nodes_scheduler}") - stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path) + stage_cfg_path = NemoMegatronStage.save_stage_hydra_config(self.stage_cfg, job_path, self.cfg) # Make cluster parameters cluster_parameters = self._make_cluster_parameters(self.cluster) + # Make k8s config file if necessary + if self.cluster == 'k8s': + template_root = os.path.join(os.path.abspath(os.path.dirname(__file__)), f'k8s_templates/{self.stage_name}') + self._make_k8s_spec_file(template_root, cluster_parameters, job_path) + self._copy_k8s_helm_chart(template_root, job_path) # Make command groups command_groups = self.make_stage_command_groups(stage_cfg_path) # Create launcher @@ -92,15 +98,30 @@ def setup_folder_and_data(self) -> None: results_folder.mkdir(parents=True, exist_ok=True) @staticmethod - def save_stage_hydra_config(stage_cfg: OmegaConf, job_path: JobPaths) -> Path: + def save_stage_hydra_config(stage_cfg: OmegaConf, job_path: JobPaths, cfg: OmegaConf) -> Path: """ Interpolate and save hydra config file for current stage :param OmegaConf stage_cfg: current stage's hydra configuration :param JobPaths job_path: JobPaths object + :param OmegaConf cfg: base config for job :return: path current stage's essential nemo scripts code :rtype: Path """ + # Since k8s uses a Helm chart that launches a job based on the Hydra config + # file, the Hydra config file that is generated needs to contain all of the + # required keys for each stage. + if cfg.cluster_type == "k8s": + # OmegaConf doesn't allow adding new keys. Temporarily create a dictionary + # representation and add the new keys before converting back to an + # OmegaConf object. 
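+            # For illustration only (paths below are hypothetical), the saved stage
+            # config gains additional top-level keys along the lines of:
+            #   data_dir: /nfs/launcher_scripts/data
+            #   cluster_type: k8s
+            #   launcher_scripts_path: /nfs/launcher_scripts
+            #   data_config: download_gpt3_pile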
+ temp_config = OmegaConf.to_object(stage_cfg) + temp_config['data_dir'] = cfg.data_dir + temp_config['cluster_type'] = cfg.cluster_type + temp_config['launcher_scripts_path'] = cfg.launcher_scripts_path + temp_config['data_config'] = stage_cfg.run.name + stage_cfg = OmegaConf.create(temp_config) + _hydra_interpolation(stage_cfg) cfg_save_path = job_path.config_file @@ -139,6 +160,10 @@ def _make_nemo_path_command(self) -> List[str]: f'export PYTHONPATH={self._nemo_code_path}:\${{PYTHONPATH}}', ] + def _make_k8s_spec_file(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths): + """Create a yaml spec file for kubernetes jobs""" + raise NotImplementedError + # def _make_numa_mapping_command(self) -> List[str]: # """Make a command of numa mapping call""" # cfg = self.cfg @@ -285,6 +310,17 @@ def _make_cluster_parameters(self, cluster: str) -> Dict: ) elif cluster == "interactive": cluster_parameters.update(shared_parameters) + elif cluster == "k8s": + cluster_cfg = cfg.get("cluster") + k8s_cfg = {**copy.deepcopy(cluster_cfg)} + + cluster_parameters = {**k8s_cfg} + cluster_parameters.update( + { + **shared_parameters, + "container_image": container_image, + } + ) return cluster_parameters @@ -540,6 +576,72 @@ def _make_hydra_override(self) -> List: if self.cluster == "bcp": hydra_override += ["+rank=\${RANK}"] return hydra_override + + def _copy_k8s_helm_chart(self, template_root: str, job_path: JobPaths): + """ + Copy the k8s Helm charts to the results directory. + + :param str template_root: path to where the k8s template files are located + :param JobPaths job_path: JobPaths object + """ + template_file = os.path.join(template_root, 'training.yaml') + chart_file = os.path.join(template_root, 'Chart.yaml') + training_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'training.yaml') + training_path.parent.mkdir(parents=True, exist_ok=True) + config_path = Path(job_path.folder / 'k8s_template' / 'config') + config_path.mkdir(parents=True, exist_ok=True) + chart_path = Path(job_path.folder / 'k8s_template' / 'Chart.yaml') + training_config_file = os.path.join(template_root, 'training-config.yaml') + training_config_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'training-config.yaml') + hydra_config_path = Path(job_path.folder / 'k8s_template' / 'config') + + shutil.copy2(template_file, training_path) + shutil.copy2(chart_file, chart_path) + shutil.copy2(training_config_file, training_config_path) + shutil.copy2(job_path.config_file, hydra_config_path) + + def _add_wandb_key_to_chart(self) -> str: + """ + Read the WandB API key file and return it to be placed in the Helm chart. + + :return: a string of the WandB API key. + :rtype: str + """ + with open(self.cfg.wandb_api_key_file, "r") as f: + wandb_api_key = f.readline().rstrip() + return wandb_api_key + + def _make_k8s_spec_file(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths): + """ + Create a spec file for a Kubernetes training job. + The spec file is generated based on the parameters in the cluster and training config files. 
+ + :param str template_root: path to where the k8s template files are located + :param Dict cluster_parameters: settings specific to the cluster that is being used + :param JobPaths job_path: JobPaths object + """ + with open(os.path.join(template_root, 'values.yaml')) as value_file: + values_template = OmegaConf.load(value_file) + + values_template.image.trainingImage = cluster_parameters['container_image'] + values_template.image.pullSecret = cluster_parameters['pull_secret'] + values_template.image.numGPUs = self.stage_cfg.trainer.devices + values_template.image.nodes = self.stage_cfg.trainer.num_nodes + values_template.trainingConfig.shmSize = cluster_parameters['shm_size'] + values_template.trainingConfig.NFSServer = cluster_parameters['nfs_server'] + values_template.trainingConfig.NFSPath = cluster_parameters['nfs_path'] + values_template.trainingConfig.ibResourceName = cluster_parameters['ib_resource_name'] + values_template.trainingConfig.ibCount = cluster_parameters['ib_count'] + + if self.cfg.wandb_api_key_file is not None: + values_template.trainingConfig.wandbKey = self._add_wandb_key_to_chart() + + k8s_template_path = job_path.folder + k8s_template_file = Path(k8s_template_path / 'k8s_template' / 'values.yaml') + k8s_template_file.parent.mkdir(parents=True, exist_ok=True) + + conf = OmegaConf.create(values_template) + OmegaConf.save(conf, k8s_template_file) def get_env_vars(self) -> Dict: """ @@ -814,6 +916,57 @@ def _make_checkpoint_search_command(self, **kwargs: Any) -> str: f"{' '.join(checkpoint_override)}" ) + def _make_k8s_spec_file(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths): + """ + Create a spec file for a Kubernetes conversion job. + The spec file is generated based on the parameters in the cluster and conversion config files. 
+ + :param str template_root: path to where the k8s template files are located + :param Dict cluster_parameters: settings specific to the cluster that is being used + :param JobPaths job_path: JobPaths object + """ + with open(os.path.join(template_root, 'values.yaml')) as value_file: + values_template = OmegaConf.load(value_file) + + num_gpus = self.cfg.conversion.model.pipeline_model_parallel_size * self.cfg.conversion.model.tensor_model_parallel_size + + values_template.image.trainingImage = cluster_parameters['container_image'] + values_template.image.pullSecret = cluster_parameters['pull_secret'] + values_template.image.gpuNum = num_gpus + values_template.trainingConfig.shmSize = cluster_parameters['shm_size'] + values_template.trainingConfig.NFSServer = cluster_parameters['nfs_server'] + values_template.trainingConfig.NFSPath = cluster_parameters['nfs_path'] + values_template.trainingConfig.vocabPath = self.cfg.conversion.model.vocab_file + values_template.trainingConfig.mergesPath = self.cfg.conversion.model.merge_file + values_template.trainingConfig.resultsDirectory = str(job_path.folder) + values_template.trainingConfig.trainingDirectory = self.cfg.conversion.run.train_dir + values_template.trainingConfig.launcherScriptsPath = self.cfg.launcher_scripts_path + values_template.trainingConfig.tensorParallelism = self.cfg.conversion.model.tensor_model_parallel_size + values_template.trainingConfig.pipelineParallelism = self.cfg.conversion.model.pipeline_model_parallel_size + + k8s_template_path = job_path.folder + k8s_template_file = Path(k8s_template_path / 'k8s_template' / 'values.yaml') + k8s_template_file.parent.mkdir(parents=True, exist_ok=True) + + conf = OmegaConf.create(values_template) + OmegaConf.save(conf, k8s_template_file) + + def _copy_k8s_helm_chart(self, template_root: str, job_path: JobPaths): + """ + Copy the k8s Helm charts to the results directory. + + :param str template_root: path to where the k8s template files are located + :param JobPaths job_path: JobPaths object + """ + template_file = os.path.join(template_root, 'conversion.yaml') + chart_file = os.path.join(template_root, 'Chart.yaml') + conversion_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'conversion.yaml') + conversion_path.parent.mkdir(parents=True, exist_ok=True) + chart_path = Path(job_path.folder / 'k8s_template' / 'Chart.yaml') + + shutil.copy2(template_file, conversion_path) + shutil.copy2(chart_file, chart_path) + def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]: """ Make the command groups for current stage @@ -987,6 +1140,77 @@ def _make_download_command_string(self) -> str: download_command_string = " \\\n ".join(download_command) return download_command_string + def _make_k8s_spec_file(self, template_root: str, cluster_parameters: Dict, job_path: JobPaths): + """ + Create a spec file for a Kubernetes conversion job. + The spec file is generated based on the parameters in the cluster and conversion config files. 
+ + :param str template_root: path to where the k8s template files are located + :param Dict cluster_parameters: settings specific to the cluster that is being used + :param JobPaths job_path: JobPaths object + """ + with open(os.path.join(template_root, 'values.yaml')) as value_file: + values_template = OmegaConf.load(value_file) + + num_gpus = self.cfg.evaluation.model.pipeline_model_parallel_size * self.cfg.evaluation.model.tensor_model_parallel_size + + values_template.image.trainingImage = cluster_parameters['container_image'] + values_template.image.pullSecret = cluster_parameters['pull_secret'] + values_template.image.gpuNum = num_gpus + values_template.trainingConfig.shmSize = cluster_parameters['shm_size'] + values_template.trainingConfig.NFSServer = cluster_parameters['nfs_server'] + values_template.trainingConfig.NFSPath = cluster_parameters['nfs_path'] + values_template.trainingConfig.vocabPath = self.cfg.evaluation.model.vocab_file + values_template.trainingConfig.mergesPath = self.cfg.evaluation.model.merge_file + values_template.trainingConfig.resultsDirectory = str(job_path.folder) + values_template.trainingConfig.trainingDirectory = self.cfg.evaluation.run.train_dir + values_template.trainingConfig.launcherScriptsPath = self.cfg.launcher_scripts_path + values_template.trainingConfig.tensorParallelism = self.cfg.evaluation.model.tensor_model_parallel_size + values_template.trainingConfig.pipelineParallelism = self.cfg.evaluation.model.pipeline_model_parallel_size + values_template.trainingConfig.name = self.cfg.evaluation.run.name + values_template.trainingConfig.model = self.cfg.evaluation.model.model_type + values_template.trainingConfig.cacheDir = os.path.join(self.cfg.data_dir, 'eval_harness_data') + values_template.trainingConfig.outputPath = os.path.join(self.cfg.evaluation.run.results_dir, + self.cfg.evaluation.run.eval_name, + 'results') + values_template.trainingConfig.batchSize = self.cfg.evaluation.model.eval_batch_size + values_template.trainingConfig.precision = self.cfg.evaluation.model.precision + values_template.trainingConfig.nemoModel = self.cfg.evaluation.model.nemo_model + values_template.trainingConfig.checkpointFolder = self.cfg.evaluation.model.checkpoint_folder + values_template.trainingConfig.checkpointName = self.cfg.evaluation.model.checkpoint_name + values_template.trainingConfig.hparamsFile = self.cfg.evaluation.model.hparams_file + values_template.trainingConfig.tasks = self.cfg.evaluation.run.tasks + + k8s_template_path = job_path.folder + k8s_template_file = Path(k8s_template_path / 'k8s_template' / 'values.yaml') + k8s_template_file.parent.mkdir(parents=True, exist_ok=True) + + conf = OmegaConf.create(values_template) + OmegaConf.save(conf, k8s_template_file) + + def _copy_k8s_helm_chart(self, template_root: str, job_path: JobPaths): + """ + Copy the k8s Helm charts to the results directory. 
+ + :param str template_root: path to where the k8s template files are located + :param JobPaths job_path: JobPaths object + """ + template_file = os.path.join(template_root, 'evaluation.yaml') + chart_file = os.path.join(template_root, 'Chart.yaml') + evaluation_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'evaluation.yaml') + evaluation_path.parent.mkdir(parents=True, exist_ok=True) + config_path = Path(job_path.folder / 'k8s_template' / 'config') + config_path.mkdir(parents=True, exist_ok=True) + chart_path = Path(job_path.folder / 'k8s_template' / 'Chart.yaml') + evaluation_config_file = os.path.join(template_root, 'evaluation-config.yaml') + evaluation_config_path = Path(job_path.folder / 'k8s_template' / 'templates' / 'evaluation-config.yaml') + hparams_config_path = Path(job_path.folder / 'k8s_template' / 'config') + + shutil.copy2(template_file, evaluation_path) + shutil.copy2(chart_file, chart_path) + shutil.copy2(evaluation_config_file, evaluation_config_path) + shutil.copy2(os.path.join(self.cfg.evaluation.run.train_dir, 'results', 'hparams.yaml'), hparams_config_path) + def make_stage_command_groups(self, stage_cfg_path: Path) -> List[List[str]]: """ Make the command groups for current stage From 91742c108d844bcece515e9177ed4ee30e2c2732 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 02:49:42 -0700 Subject: [PATCH 25/62] add llama support for auto configurator Signed-off-by: Hongbin Liu --- .../autoconfig/scripts/compare_throughput.py | 19 ++++++++-------- auto_configurator/autoconfig/search_config.py | 2 +- .../autoconfig/training_config.py | 22 ++++++++++++------- auto_configurator/autoconfig/utils.py | 14 ++++++------ 4 files changed, 32 insertions(+), 25 deletions(-) diff --git a/auto_configurator/autoconfig/scripts/compare_throughput.py b/auto_configurator/autoconfig/scripts/compare_throughput.py index c6c30a031b..d3b5b3e6c2 100644 --- a/auto_configurator/autoconfig/scripts/compare_throughput.py +++ b/auto_configurator/autoconfig/scripts/compare_throughput.py @@ -16,7 +16,8 @@ def main(cfg): settings_cfg = cfg.search_config.train_settings model_size = settings_cfg.model_size_in_b output_top_n = settings_cfg.output_top_n - nodes = cfg.get("nodes") + nodes = settings_cfg.num_nodes + #nodes = cfg.get("nodes") training_logs = os.path.join(settings_cfg.get("logs"), "training_logs") candidate_configs = os.path.join(settings_cfg.get("logs"), "candidate_configs") @@ -77,11 +78,11 @@ def main(cfg): model_name = candidate_cfg.get("run").get("name").split("_")[0] gbs = model_cfg.get("global_batch_size") enc_seq_len = ( - model_cfg.get("encoder_seq_length") if model_name in ("gpt3", "bert") else model_cfg.get("seq_length") + model_cfg.get("encoder_seq_length") if model_name in ("gpt3", "bert", "llama") else model_cfg.get("seq_length") ) dec_seq_len = data_cfg.get("seq_length_dec") - if model_name in ("gpt3", "bert"): + if model_name in ("gpt3", "bert", "llama"): hs = model_cfg.get("hidden_size") ffn_hs = None layers = model_cfg.get("num_layers") @@ -138,9 +139,9 @@ def main(cfg): ea.Reload() try: timing_list = ea.Scalars("train_step_timing") - if len(timing_list) <= 6: - continue - timing_list = [x.value for x in timing_list[5:]] + #if len(timing_list) <= 6: + # continue + timing_list = [x.value for x in timing_list[0:]] avg_global_step_time = round(sum(timing_list) / len(timing_list), 4) samples_per_s = round(gbs / avg_global_step_time, 2) m_tflops, m_tflops_gpu = calculate_tflops( @@ -184,14 +185,14 @@ def main(cfg): finally: continue - 
result.sort(key=lambda x: x[14]) + result.sort(key=lambda x: x[15]) print(f"Top {min(output_top_n, len(result))} configs sorted from fastest to slowest:") for i, res in enumerate(result): print(f"Config #{i+1}: {res[-1]} with {res[14]:.4f}s per global step.") if i + 1 == output_top_n: break - top_config = f"{model_name}_{model_size}b_{nodes}nodes_tp_{result[0][2]}_pp_{result[0][3]}_mbs_{result[0][4]}_act_ckpt_{result[0][5]}_num_mbs_act_{result[0][6]}_act_per_pipe_{result[0][7]}" + top_config = f"{model_name}_{model_size}b_{nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_mbs_{result[0][5]}_act_ckpt_{result[0][6]}_num_mbs_act_{result[0][7]}_act_per_pipe_{result[0][8]}" print("\n==================================================") print(f"Optimal config: {top_config} with {result[0][14]:.4f}s per global step.") print(f"Saving config to {final_result_logs}/optimal_config_{model_size}b_{nodes}nodes.yaml.") @@ -223,7 +224,7 @@ def calculate_tflops( Bert Formula: Model FLOPs = 72BLsh^2 * ( 1 + (s/6h) + (v/12hL)) """ - if model_name == "gpt3": + if model_name in ["gpt3", "llama"]: # Model FLOPS calculation model_flops = ( (24 * gbs * enc_seq_len * hs * hs + 4 * gbs * enc_seq_len * enc_seq_len * hs) * (3 * layers) diff --git a/auto_configurator/autoconfig/search_config.py b/auto_configurator/autoconfig/search_config.py index 6870359ede..1f50a6d707 100644 --- a/auto_configurator/autoconfig/search_config.py +++ b/auto_configurator/autoconfig/search_config.py @@ -20,7 +20,7 @@ from autoconfig.inference_sweep import search_inference_config from autoconfig.training_config import search_training_config -SUPPORTED_MODELS = ["gpt3", "t5", "mt5", "bert"] +SUPPORTED_MODELS = ["gpt3", "t5", "mt5", "bert", "llama"] def search_config(cfg: omegaconf.dictconfig.DictConfig, hydra_args: Optional[str] = None): diff --git a/auto_configurator/autoconfig/training_config.py b/auto_configurator/autoconfig/training_config.py index 71f01f20e8..7940aecac1 100644 --- a/auto_configurator/autoconfig/training_config.py +++ b/auto_configurator/autoconfig/training_config.py @@ -69,12 +69,12 @@ def generate_grid_search_configs( act_layers = train_cfg.get("act_ckpt_layers") # 2 * num_layers is needed because of encoder/decoder architecture. - multiplier = 1 if model_name in ["gpt3", "bert"] else 2 + multiplier = 1 if model_name in ["gpt3", "bert", "llama"] else 2 seq_length = base_cfg["model"]["data"]["seq_length"] num_layers = ( base_cfg["model"]["num_layers"] - if model_name in ["gpt3", "bert"] + if model_name in ["gpt3", "bert", "llama"] else base_cfg["model"]["encoder"]["num_layers"] ) @@ -96,7 +96,7 @@ def generate_grid_search_configs( for mbs in mbs_list: num_gpus = base_cfg["trainer"]["num_nodes"] * base_cfg["trainer"]["devices"] gbs = base_cfg["model"]["global_batch_size"] - if model_name in ["gpt3", "bert"]: + if model_name in ["gpt3", "bert", "llama"]: att_heads = base_cfg["model"]["num_attention_heads"] num_layers = base_cfg["model"]["num_layers"] else: @@ -175,7 +175,7 @@ def _set_activations_checkpoint_params(tp, pp, num_layers, act_method, multiplie min_layers_per_pipe = 0 max_layers_per_pipe = num_layers interval_layers_per_pipe = act_multiple - if model_name in ["gpt3", "bert"] and pp > 2: # Interleaved pipeline scheduling. + if model_name in ["gpt3", "bert", "llama"] and pp > 2: # Interleaved pipeline scheduling. virtual_pipelines = num_layers // pp # TODO: verify that this is the best value. 
act_multiple = 1 max_micro_b = pp * (virtual_pipelines - 1) + (pp - 1) * 2 + 1 @@ -190,7 +190,7 @@ def _set_activations_checkpoint_params(tp, pp, num_layers, act_method, multiplie else: act_ckpt_layers = range(0, multiplier * num_layers // pp // virtual_pipelines + 1, act_multiple) - if pp > 1 and model_name in ["gpt3", "bert"]: + if pp > 1 and model_name in ["gpt3", "bert", "llama"]: # Num micro batches with partial act ckpt num_micro_batches_partial_act_ckpt = list(range(min_micro_b, max_micro_b + 1, interval_micro_b)) if num_micro_batches_partial_act_ckpt[0] == 0: @@ -304,6 +304,12 @@ def _tp_pp_mbs_grid_gpt3_80gb(model_size_in_b: float, valid_pp: List[int], seq_l mbs = [1, 2] min_model_parallel = 8 max_model_parallel = 32 + elif model_size_in_b <= 95: + tp = [4, 8] + pp = [x for x in valid_pp if 1 <= x <= 8] + mbs = [1, 2] + min_model_parallel = 8 + max_model_parallel = 64 elif seq_length == 8192: if model_size_in_b <= 1.0: tp = [1, 2] @@ -738,13 +744,13 @@ def _calculate_tp_pp_mbs_grid( mbs_sizes = train_cfg.get("micro_batch_sizes") gpu_memory_gb = train_cfg.get("gpu_memory_gb") - multiplier = 1 if model_name in ["gpt3", "bert"] else 2 - init_pp = [] if model_name == "gpt3" else [1] + multiplier = 1 if model_name in ["gpt3", "bert", "llama"] else 2 + init_pp = [] if model_name in ["gpt3", "llama"] else [1] valid_pp = init_pp + [ multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0 ] # Only divisors of num_layers are possible. - if model_name == "gpt3": + if model_name in ["gpt3", "llama"]: if gpu_memory_gb == 80: tp, pp, mbs, min_model_parallel, max_model_parallel = _tp_pp_mbs_grid_gpt3_80gb( model_size_in_b=model_size_in_b, valid_pp=valid_pp, seq_length=seq_length diff --git a/auto_configurator/autoconfig/utils.py b/auto_configurator/autoconfig/utils.py index 7e9b59460d..7a3125e0a8 100644 --- a/auto_configurator/autoconfig/utils.py +++ b/auto_configurator/autoconfig/utils.py @@ -45,7 +45,7 @@ def _calculate_model_size( :rtype: float :raises NotImplementedError: if the model name is not valid. """ - if model_name == "gpt3": + if model_name in ["gpt3", "llama"]: model_size = ( 12 * num_layers @@ -96,7 +96,7 @@ def calculate_model_size_params( :raises NotImplementedError: if the model name is not supported. """ ffn, kv = None, None # Only needed for some models. 
- if model_name == "gpt3": + if model_name in ["gpt3", "llama"]: if model_size_in_b < 0.25: hs, att_h, lr = 768, 12, 6e-4 elif model_size_in_b < 0.5: @@ -350,26 +350,26 @@ def modify_cfg( """ new_cfg = copy.deepcopy(base_cfg) if act is not None: - if model_name in ["gpt3", "bert"]: + if model_name in ["gpt3", "bert", "llama"]: new_cfg["model"]["activations_checkpoint_num_layers"] = act else: new_cfg["model"]["encoder"]["activations_checkpoint_num_layers"] = act // 2 new_cfg["model"]["decoder"]["activations_checkpoint_num_layers"] = act // 2 - if num_mbs_act is not None and model_name in ["gpt3", "bert"]: + if num_mbs_act is not None and model_name in ["gpt3", "bert", "llama"]: new_cfg["model"]["num_micro_batches_with_partial_activation_checkpoints"] = num_mbs_act - if act_per_pipe is not None and model_name in ["gpt3", "bert"]: + if act_per_pipe is not None and model_name in ["gpt3", "bert", "llama"]: new_cfg["model"]["activations_checkpoint_layers_per_pipeline"] = act_per_pipe - if virtual_pipelines is not None and model_name in ["gpt3", "bert"]: + if virtual_pipelines is not None and model_name in ["gpt3", "bert", "llama"]: new_cfg["model"]["virtual_pipeline_model_parallel_size"] = virtual_pipelines new_cfg["model"]["tensor_model_parallel_size"] = tp new_cfg["model"]["pipeline_model_parallel_size"] = pp new_cfg["model"]["micro_batch_size"] = mbs - if model_name in ["gpt3", "bert"]: + if model_name in ["gpt3", "bert", "llama"]: att_heads = new_cfg["model"]["num_attention_heads"] num_layers = new_cfg["model"]["num_layers"] else: From 389ba7a5ed06e3db9a2a354c4f8cd87b7e7f1811 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 05:19:04 -0700 Subject: [PATCH 26/62] add llama2 training config Signed-off-by: Hongbin Liu --- launcher_scripts/conf/training/llama/13b.yaml | 77 +++++- launcher_scripts/conf/training/llama/30b.yaml | 78 +++++- launcher_scripts/conf/training/llama/65b.yaml | 78 +++++- launcher_scripts/conf/training/llama/7b.yaml | 138 +++++------ .../conf/training/llama/llama2_13b.yaml | 220 +++++++++++++++++ .../conf/training/llama/llama2_70b.yaml | 225 ++++++++++++++++++ .../conf/training/llama/llama2_7b.yaml | 223 +++++++++++++++++ 7 files changed, 948 insertions(+), 91 deletions(-) mode change 100755 => 100644 launcher_scripts/conf/training/llama/7b.yaml create mode 100644 launcher_scripts/conf/training/llama/llama2_13b.yaml create mode 100644 launcher_scripts/conf/training/llama/llama2_70b.yaml create mode 100644 launcher_scripts/conf/training/llama/llama2_7b.yaml diff --git a/launcher_scripts/conf/training/llama/13b.yaml b/launcher_scripts/conf/training/llama/13b.yaml index cf6f8ec8cc..e06835be27 100644 --- a/launcher_scripts/conf/training/llama/13b.yaml +++ b/launcher_scripts/conf/training/llama/13b.yaml @@ -10,16 +10,17 @@ trainer: precision: bf16 logger: false enable_checkpointing: false - replace_sampler_ddp: false + use_distributed_sampler: false max_epochs: null max_steps: 300000 max_time: '5:23:30:00' log_every_n_steps: 10 val_check_interval: 2000 - limit_val_batches: 50 + limit_val_batches: 32 limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 + num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -30,7 +31,7 @@ exp_manager: name: ${training.run.name} resume_if_exists: true resume_ignore_no_checkpoint: true - create_checkpoint_callback: false + create_checkpoint_callback: true checkpoint_callback_params: monitor: val_loss save_top_k: 10 @@ -45,8 +46,9 @@ exp_manager: sync_cuda: 
true buffer_size: 5 model: + mcore_gpt: true micro_batch_size: 2 - global_batch_size: 2048 + global_batch_size: 128 rampup_batch_size: null tensor_model_parallel_size: 4 pipeline_model_parallel_size: 1 @@ -114,7 +116,7 @@ model: num_micro_batches_with_partial_activation_checkpoints: 0 activations_checkpoint_layers_per_pipeline: 0 sequence_parallel: false - transformer_engine: false + transformer_engine: true fp8: false fp8_e4m3: false fp8_hybrid: false @@ -151,7 +153,68 @@ model: eod_mask_loss: false index_mapping_dir: null data_prefix: - - 0.5 + - .0333 - ${data_dir}/my-llama_00_text_document - - 0.5 + - .0333 - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + diff --git a/launcher_scripts/conf/training/llama/30b.yaml b/launcher_scripts/conf/training/llama/30b.yaml index 33ed5054c8..ebdfa06f02 100644 --- a/launcher_scripts/conf/training/llama/30b.yaml +++ b/launcher_scripts/conf/training/llama/30b.yaml @@ -10,13 +10,13 @@ trainer: precision: bf16 logger: false enable_checkpointing: false - replace_sampler_ddp: false + use_distributed_sampler: false max_epochs: null max_steps: 300000 max_time: '19:23:30:00' log_every_n_steps: 10 val_check_interval: 2000 - limit_val_batches: 50 + limit_val_batches: 32 limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 @@ -44,6 +44,7 @@ exp_manager: sync_cuda: true buffer_size: 5 model: + mcore_gpt: true micro_batch_size: 1 global_batch_size: 2048 rampup_batch_size: null @@ -113,7 +114,7 @@ model: num_micro_batches_with_partial_activation_checkpoints: 2 activations_checkpoint_layers_per_pipeline: 32 sequence_parallel: false - transformer_engine: false + transformer_engine: true fp8: false fp8_e4m3: false fp8_hybrid: false @@ -125,14 +126,14 @@ model: ub_tp_comm_overlap: false use_flash_attention: false optim: - name: fused_adam + name: distributed_fused_adam lr: 0.0001 weight_decay: 0.1 betas: - 0.9 - 0.95 - #bucket_cap_mb: 125 - #overlap_grad_sync: false + bucket_cap_mb: 125 + overlap_grad_sync: false sched: name: 
CosineAnnealing warmup_steps: 107 @@ -150,7 +151,68 @@ model: eod_mask_loss: false index_mapping_dir: null data_prefix: - - .5 + - .0333 - ${data_dir}/my-llama_00_text_document - - .5 + - .0333 - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + diff --git a/launcher_scripts/conf/training/llama/65b.yaml b/launcher_scripts/conf/training/llama/65b.yaml index 464af39c09..0f65d40071 100644 --- a/launcher_scripts/conf/training/llama/65b.yaml +++ b/launcher_scripts/conf/training/llama/65b.yaml @@ -10,16 +10,17 @@ trainer: precision: bf16 logger: false enable_checkpointing: false - replace_sampler_ddp: false + use_distributed_sampler: false max_epochs: null max_steps: 300000 max_time: '19:23:30:00' log_every_n_steps: 10 val_check_interval: 2000 - limit_val_batches: 50 + limit_val_batches: 32 limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 + num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -44,6 +45,7 @@ exp_manager: sync_cuda: true buffer_size: 5 model: + mcore_gpt: true micro_batch_size: 1 global_batch_size: 2048 rampup_batch_size: null @@ -113,7 +115,7 @@ model: num_micro_batches_with_partial_activation_checkpoints: 80 activations_checkpoint_layers_per_pipeline: 0 sequence_parallel: false - transformer_engine: false + transformer_engine: true fp8: false fp8_e4m3: false fp8_hybrid: false @@ -125,14 +127,14 @@ model: ub_tp_comm_overlap: false use_flash_attention: false optim: - name: fused_adam + name: distributed_fused_adam lr: 0.0001 weight_decay: 0.1 betas: - 0.9 - 0.95 - #bucket_cap_mb: 125 - #overlap_grad_sync: false + bucket_cap_mb: 125 + overlap_grad_sync: false sched: name: CosineAnnealing warmup_steps: 107 @@ -150,8 +152,68 @@ model: eod_mask_loss: false index_mapping_dir: null data_prefix: - - .5 + - .0333 - ${data_dir}/my-llama_00_text_document - - .5 + - .0333 - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - 
${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document diff --git a/launcher_scripts/conf/training/llama/7b.yaml b/launcher_scripts/conf/training/llama/7b.yaml old mode 100755 new mode 100644 index 96cb4790c0..cc1bb32c15 --- a/launcher_scripts/conf/training/llama/7b.yaml +++ b/launcher_scripts/conf/training/llama/7b.yaml @@ -4,22 +4,23 @@ run: time_limit: "0-04:00:00" dependency: "singleton" trainer: - num_nodes: 4 + num_nodes: 2 devices: 8 accelerator: gpu precision: bf16 logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: null max_steps: 300000 # consumed_samples = global_step * global_batch_size max_time: "05:23:30:00" # days:hours:minutes:seconds log_every_n_steps: 10 val_check_interval: 2000 - limit_val_batches: 50 + limit_val_batches: 32 limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 + num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -45,8 +46,9 @@ exp_manager: buffer_size: 5 model: + mcore_gpt: true micro_batch_size: 2 - global_batch_size: 2048 + global_batch_size: 128 rampup_batch_size: null tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 @@ -117,7 +119,7 @@ model: ## Transformer Engine # fp8 training is currently not supported in the improved models - transformer_engine: False + transformer_engine: True fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID @@ -127,7 +129,7 @@ model: fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history use_emha: False ub_tp_comm_overlap: False - use_flash_attention: false + use_flash_attention: True optim: name: distributed_fused_adam lr: 1e-4 @@ -154,68 +156,68 @@ model: eod_mask_loss: false index_mapping_dir: null data_prefix: - - .5 + - .0333 - ${data_dir}/my-llama_00_text_document - - .5 + - .0333 - ${data_dir}/my-llama_01_text_document - # - .0333 - # - ${data_dir}/my-gpt3_00_text_document - # - .0333 - # - ${data_dir}/my-gpt3_01_text_document - # - .0333 - # - ${data_dir}/my-gpt3_02_text_document - # - .0333 - # - ${data_dir}/my-gpt3_03_text_document - # - .0333 - # - ${data_dir}/my-gpt3_04_text_document - # - .0333 - # - ${data_dir}/my-gpt3_05_text_document - # - .0333 - # - ${data_dir}/my-gpt3_06_text_document - # - .0333 - # - ${data_dir}/my-gpt3_07_text_document - # - .0333 - # - ${data_dir}/my-gpt3_08_text_document - # - .0333 - # - ${data_dir}/my-gpt3_09_text_document - # - .0333 - # - ${data_dir}/my-gpt3_10_text_document - # - .0333 - # - ${data_dir}/my-gpt3_11_text_document - # - .0333 - # - ${data_dir}/my-gpt3_12_text_document - # - .0333 - # - ${data_dir}/my-gpt3_13_text_document - # - .0333 - # - ${data_dir}/my-gpt3_14_text_document - # - .0333 - # - ${data_dir}/my-gpt3_15_text_document - # - .0333 - # - ${data_dir}/my-gpt3_16_text_document - # - .0333 - # - ${data_dir}/my-gpt3_17_text_document - # - .0333 - # - ${data_dir}/my-gpt3_18_text_document - # - .0333 - # - ${data_dir}/my-gpt3_19_text_document - # - .0333 - # - ${data_dir}/my-gpt3_20_text_document - # - .0333 - # - ${data_dir}/my-gpt3_21_text_document - # - .0333 - # - ${data_dir}/my-gpt3_22_text_document - # - .0333 - # - ${data_dir}/my-gpt3_23_text_document - # - .0333 - # - ${data_dir}/my-gpt3_24_text_document - # - .0333 - # - ${data_dir}/my-gpt3_25_text_document - # - .0333 - # - ${data_dir}/my-gpt3_26_text_document - # - .0333 - # - ${data_dir}/my-gpt3_27_text_document - # - .0333 - # - ${data_dir}/my-gpt3_28_text_document - # - .0334 - # - ${data_dir}/my-gpt3_29_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 
+ - ${data_dir}/my-llama_29_text_document diff --git a/launcher_scripts/conf/training/llama/llama2_13b.yaml b/launcher_scripts/conf/training/llama/llama2_13b.yaml new file mode 100644 index 0000000000..a77584a33d --- /dev/null +++ b/launcher_scripts/conf/training/llama/llama2_13b.yaml @@ -0,0 +1,220 @@ +run: + name: llama2_13b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 4 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 300000 + max_time: '5:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, + ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + mcore_gpt: true + micro_batch_size: 2 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 40 + hidden_size: 5120 + ffn_hidden_size: 13824 + num_attention_heads: 40 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + 
sequence_parallel: true + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: false + sched: + name: CosineAnnealing + warmup_steps: 107 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 4096 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml new file mode 100644 index 0000000000..7697b36e0f --- /dev/null +++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml @@ -0,0 +1,225 @@ +run: + name: llama2_70b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 8 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 + max_time: '19:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + 
checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 28672 + num_attention_heads: 64 + num_query_groups: 8 + init_method_std: 0.008944 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 1 + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: true + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: true + overlap_p2p_comm: false + batch_p2p_comm: true + gc_interval: 100 + optim: + name: distributed_fused_adam + lr: 0.00015 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 4096 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - 
${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + diff --git a/launcher_scripts/conf/training/llama/llama2_7b.yaml b/launcher_scripts/conf/training/llama/llama2_7b.yaml new file mode 100644 index 0000000000..dcc2887bcf --- /dev/null +++ b/launcher_scripts/conf/training/llama/llama2_7b.yaml @@ -0,0 +1,223 @@ +run: + name: llama2_7b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:00:00" + dependency: "singleton" +trainer: + num_nodes: 2 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 2 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 
11008 + num_attention_heads: 32 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'sentencepiece' + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true # does not support sequence parallel + + ## Transformer Engine + # fp8 training is currently not supported in the improved models + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + use_emha: False + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: False + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: "99990,8,2" + seq_length: 4096 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + From fd414f930c890d1766f4aae89d0b685559cba05a Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 05:43:59 -0700 Subject: [PATCH 27/62] update prompt learning for llama2 Signed-off-by: Hongbin Liu --- .../conf/prompt_learning/llama/squad.yaml | 20 ++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/launcher_scripts/conf/prompt_learning/llama/squad.yaml b/launcher_scripts/conf/prompt_learning/llama/squad.yaml index 51104ba17d..4336a568bc 100755 --- a/launcher_scripts/conf/prompt_learning/llama/squad.yaml +++ b/launcher_scripts/conf/prompt_learning/llama/squad.yaml @@ -3,26 +3,25 @@ run: time_limit: "01:00:00" dependency: "singleton" convert_name: convert_nemo - model_train_name: llama_7b + model_train_name: llama2_7b convert_dir: ${base_results_dir}/${prompt_learning.run.model_train_name}/${prompt_learning.run.convert_name} task_name: "squad" results_dir: ${base_results_dir}/${.model_train_name}/prompt_learning_${.task_name} trainer: devices: 8 - num_nodes: 1 + num_nodes: 4 accelerator: gpu precision: bf16 logger: False enable_checkpointing: False - replace_sampler_ddp: False + use_distributed_sampler: False max_epochs: 4 max_steps: -1 log_every_n_steps: 10 val_check_interval: 200 accumulate_grad_batches: 1 
gradient_clip_val: 1.0 - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. exp_manager: explicit_log_dir: ${prompt_learning.run.results_dir}/results @@ -32,7 +31,7 @@ exp_manager: wandb_logger_kwargs: project: nemo_llama_prompt name: ${prompt_learning.run.name} - resume_if_exists: True + resume_if_exists: False resume_ignore_no_checkpoint: True create_checkpoint_callback: True checkpoint_callback_params: @@ -46,14 +45,14 @@ exp_manager: model: seed: 1234 - nemo_path: ${prompt_learning.run.results_dir}/results/megatron_gpt_prompt.nemo # the place to save prompt learning nemo checkpoint + nemo_path: ${prompt_learning.run.results_dir}/results/megatron_llama_prompt.nemo # the place to save prompt learning nemo checkpoint virtual_prompt_style: 'p-tuning' # One of 'p-tuning', 'prompt-tuning', or 'inference'. We recommend 'p-tuning' over 'prompt-tuning'. tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} - encoder_seq_length: 2048 - global_batch_size: 64 - micro_batch_size: 8 + encoder_seq_length: 4096 + global_batch_size: 8 + micro_batch_size: 1 restore_path: null # used to restore from a prompt tuned checkpoint and add new tasks language_model_path: ${prompt_learning.run.convert_dir}/results/megatron_llama.nemo # Restore lanugage model from pre-trained .nemo checkpoint @@ -64,6 +63,9 @@ model: # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. sequence_parallel: False + activations_checkpoint_granularity: selective + activations_checkpoint_num_layers: 1 + activations_checkpoint_method: block task_templates: # task_templates for all existing_tasks and new_tasks are required. - taskname: "squad" # The task name From b38b964ee3be5251599efb1a7881af0580edc4b3 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 05:46:41 -0700 Subject: [PATCH 28/62] update conversion for llama2 Signed-off-by: Hongbin Liu --- launcher_scripts/conf/conversion/llama/convert_llama.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/launcher_scripts/conf/conversion/llama/convert_llama.yaml b/launcher_scripts/conf/conversion/llama/convert_llama.yaml index 451d916b20..9dfb362cc2 100755 --- a/launcher_scripts/conf/conversion/llama/convert_llama.yaml +++ b/launcher_scripts/conf/conversion/llama/convert_llama.yaml @@ -5,16 +5,16 @@ run: dependency: "singleton" ntasks_per_node: ${divide_ceil:${conversion.model.model_parallel_size}, ${.nodes}} convert_name: convert_nemo - model_train_name: llama_7b + model_train_name: llama2_7b train_dir: ${base_results_dir}/${.model_train_name} results_dir: ${.train_dir}/${.convert_name} - nemo_file_name: megatron_llama_prompt.nemo # name of nemo checkpoint; must be .nemo file + nemo_file_name: megatron_llama.nemo # name of nemo checkpoint; must be .nemo file model: model_type: gpt # gpt or t5, use t5 for mt5 as well - checkpoint_folder: ${conversion.run.train_dir}/prompt_learning_squad/results/checkpoints + checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_llama-*last.ckpt) - hparams_file: ${conversion.run.train_dir}/prompt_learning_squad/results/hparams.yaml + hparams_file: ${conversion.run.train_dir}/results/hparams.yaml tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} From 633367e9967b3881446b2900a359632864ee2edf Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 05:53:41 -0700 Subject: [PATCH 29/62] update evaluation script for llama2 Signed-off-by: Hongbin Liu --- .../conf/evaluation/llama/evaluate_all.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml index ca4d9b7456..047f72b866 100755 --- a/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml +++ b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml @@ -1,11 +1,11 @@ run: name: ${.eval_name}_${.model_train_name} - time_limit: "01:00:00" + time_limit: "02:00:00" dependency: "singleton" nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} eval_name: eval_all - model_train_name: llama_7b + model_train_name: llama2_7b train_dir: ${base_results_dir}/${.model_train_name} tasks: all_tasks # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} @@ -13,12 +13,12 @@ run: model: model_type: nemo-llama nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints - checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints - checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) - hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml + #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints + #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) + #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} precision: bf16 # must match training precision - 32, 16 or bf16 eval_batch_size: 4 - tokenizer_model: ${data_dir}/llama/llama_tokenizer.model + #tokenizer_model: ${data_dir}/llama/llama_tokenizer.model From f5cee87301a843699d468c39d3136138e14d22e6 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 05:55:18 -0700 Subject: [PATCH 30/62] update evaluation scripts for llama2 Signed-off-by: Hongbin Liu --- .../eval_harness/lm_eval/models/nemo_gpt3.py | 31 ++++++++++++++----- .../eval_harness/lm_eval/models/nemo_llama.py | 3 +- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_gpt3.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_gpt3.py index d20eb89c69..e79f5808cc 100755 --- a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_gpt3.py +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_gpt3.py @@ -35,11 +35,11 @@ class RequestDataset(Dataset): - def __init__(self, requests, tokenizer) -> None: + def __init__(self, requests, tokenizer, max_length) -> None: super().__init__() self.requests = requests self.tokenizer = tokenizer - self.max_length = 2048 + self.max_length = max_length def __len__(self): return len(self.requests) @@ -148,12 +148,29 @@ def dummy(): logging.info(f'Setting up transformer engine modules for tensor parallelism.') if model.cfg.get('megatron_amp_O2', 'False'): # when using O2 additional module key is added that casts the weights - for layer in model.model.module.language_model.encoder.layers: - layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group()) + if model.cfg.get('mcore_gpt', False): + for layer in model.model.module.decoder.layers: + layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group()) + else: + for layer in model.model.module.language_model.encoder.layers: + layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group()) else: - for layer in model.model.language_model.encoder.layers: - layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group()) + if model.cfg.get('mcore_gpt', False): + for module in model.get_gpt_module_list(): + """Set TP group + Copied from: https://github.com/NVIDIA/TransformerEngine/blob/main/transformer_engine/pytorch/transformer.py#L398 + """ + # Deep iterate but skip self to avoid infinite recursion. 
+ for index, child in enumerate(module.modules()): + if index == 0: + continue + if hasattr(child, "set_tensor_parallel_group"): + tp_group = parallel_state.get_tensor_model_parallel_group() + child.set_tensor_parallel_group(tp_group) + else: + for layer in model.model.language_model.encoder.layers: + layer.set_tensor_parallel_group(parallel_state.get_tensor_model_parallel_group()) class NeMo_GPT3LM_TP_PP(LM): @@ -241,7 +258,7 @@ def _collate(x): # used to reorder request and remove duplications return -len(toks), tuple(toks) reord = utils.Reorderer(requests, _collate) - request_ds = RequestDataset(reord.get_reordered(), self.model.tokenizer) + request_ds = RequestDataset(reord.get_reordered(), self.model.tokenizer, self.max_length) request_dl = DataLoader(request_ds, collate_fn=pad_collate, batch_size=self.batch_size, shuffle=False) def logits_to_results(batch, response): diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py index 462d28f549..975bdf4b2e 100755 --- a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py @@ -120,7 +120,7 @@ def _collate(x): # used to reorder request and remove duplications return -len(toks), tuple(toks) reord = utils.Reorderer(requests, _collate) - request_ds = RequestDataset(reord.get_reordered(), self.model.tokenizer) + request_ds = RequestDataset(reord.get_reordered(), self.model.tokenizer, self.max_length) request_dl = DataLoader(request_ds, collate_fn=pad_collate, batch_size=self.batch_size, shuffle=False) def logits_to_results(batch, response): @@ -171,6 +171,7 @@ def logits_to_results(batch, response): greedy=True, repetition_penalty=1.0, min_tokens_to_generate=0, + compute_logprob=True, ) response = get_computeprob_response(self.tokenizer, response, inputs) From 4f3421b8ae8fe6f3511c5f6be7166441196c87a9 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 05:56:17 -0700 Subject: [PATCH 31/62] revert adding nemo_dir Signed-off-by: Hongbin Liu --- launcher_scripts/nemo_launcher/core/stages.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 3277996eb1..4410bd3adf 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -213,9 +213,8 @@ def add_container_mounts(container_mounts): cfg = self.cfg data_dir = cfg.get("data_dir") - nemo_dir = cfg.get("nemo_dir") base_results_dir = cfg.get("base_results_dir") - mounts_string = f"{self._launcher_scripts_path}:{self._launcher_scripts_path},{data_dir}:{data_dir},{base_results_dir}:{base_results_dir},{nemo_dir}:{nemo_dir}" + mounts_string = f"{self._launcher_scripts_path}:{self._launcher_scripts_path},{data_dir}:{data_dir},{base_results_dir}:{base_results_dir}" container_mounts = cfg.get("container_mounts") mounts_string += add_container_mounts(container_mounts) @@ -425,9 +424,10 @@ def get_job_path(self, sub_stage: Optional = None) -> JobPaths: @property def _set_ln_sm_margin(self) -> str: """ Set LayerNorm SM margin when using P2P communication overlap to support the overlap with LayerNorm kernel """ + vpp = self.cfg.training.model.get("virtual_pipeline_model_parallel_size") if (self.cfg.training.model.get("overlap_p2p_comm", False) and 
self.cfg.training.model.get("pipeline_model_parallel_size") > 1 and - self.cfg.training.model.get("virtual_pipeline_model_parallel_size") > 1): + vpp is not None and vpp > 1): get_ln_sm_margin_command = ( f"python3 {self._launcher_scripts_path / 'nemo_launcher/collections/conditional_cfgs.py'} " f"name=get_ln_sm_margin" From 50ba136a5e8ca8187db6afac628851f1ff8d737f Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 06:00:54 -0700 Subject: [PATCH 32/62] add llama2 config for auto-configurator Signed-off-by: Hongbin Liu --- .../base_configs/llama2_13b.yaml | 220 +++++++++++++++++ .../base_configs/llama2_70b.yaml | 225 ++++++++++++++++++ auto_configurator/base_configs/llama2_7b.yaml | 223 +++++++++++++++++ .../conf/search_config/llama/13b.yaml | 40 ++++ .../conf/search_config/llama/70b.yaml | 40 ++++ .../conf/search_config/llama/7b.yaml | 40 ++++ 6 files changed, 788 insertions(+) create mode 100644 auto_configurator/base_configs/llama2_13b.yaml create mode 100644 auto_configurator/base_configs/llama2_70b.yaml create mode 100755 auto_configurator/base_configs/llama2_7b.yaml create mode 100644 auto_configurator/conf/search_config/llama/13b.yaml create mode 100644 auto_configurator/conf/search_config/llama/70b.yaml create mode 100644 auto_configurator/conf/search_config/llama/7b.yaml diff --git a/auto_configurator/base_configs/llama2_13b.yaml b/auto_configurator/base_configs/llama2_13b.yaml new file mode 100644 index 0000000000..a77584a33d --- /dev/null +++ b/auto_configurator/base_configs/llama2_13b.yaml @@ -0,0 +1,220 @@ +run: + name: llama2_13b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 4 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: false + max_epochs: null + max_steps: 300000 + max_time: '5:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, + ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + mcore_gpt: true + micro_batch_size: 2 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 40 + hidden_size: 5120 + ffn_hidden_size: 13824 + num_attention_heads: 40 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + 
activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: true + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 0.0001 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: false + sched: + name: CosineAnnealing + warmup_steps: 107 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 4096 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - 
${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + diff --git a/auto_configurator/base_configs/llama2_70b.yaml b/auto_configurator/base_configs/llama2_70b.yaml new file mode 100644 index 0000000000..7697b36e0f --- /dev/null +++ b/auto_configurator/base_configs/llama2_70b.yaml @@ -0,0 +1,225 @@ +run: + name: llama2_70b + results_dir: ${base_results_dir}/${.name} + time_limit: 0-01:00:00 + dependency: singleton +trainer: + num_nodes: 8 + devices: 8 + accelerator: gpu + precision: bf16 + logger: false + enable_checkpointing: false + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 + max_time: '19:23:30:00' + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: false + save_nemo_on_train_end: true + filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: true + step_timing_kwargs: + sync_cuda: true + buffer_size: 5 +model: + mcore_gpt: true + micro_batch_size: 1 + global_batch_size: 128 + rampup_batch_size: null + tensor_model_parallel_size: 4 + pipeline_model_parallel_size: 4 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 80 + hidden_size: 8192 + ffn_hidden_size: 28672 + num_attention_heads: 64 + num_query_groups: 8 + init_method_std: 0.008944 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: sentencepiece + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: false + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 1 + 
num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: true + transformer_engine: true + fp8: false + fp8_e4m3: false + fp8_hybrid: false + fp8_margin: 0 + fp8_interval: 1 + fp8_amax_history_len: 1 + fp8_amax_compute_algo: most_recent + use_emha: false + ub_tp_comm_overlap: false + use_flash_attention: true + overlap_p2p_comm: false + batch_p2p_comm: true + gc_interval: 100 + optim: + name: distributed_fused_adam + lr: 0.00015 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true + sched: + name: CosineAnnealing + warmup_steps: 2000 + constant_steps: 11873 + min_lr: 1.0e-05 + data: + data_impl: mmap + splits_string: 99990,8,2 + seq_length: 4096 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_00_text_document + - .0333 + - ${data_dir}/my-llama_01_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document + diff --git a/auto_configurator/base_configs/llama2_7b.yaml b/auto_configurator/base_configs/llama2_7b.yaml new file mode 100755 index 0000000000..39222af385 --- /dev/null +++ b/auto_configurator/base_configs/llama2_7b.yaml @@ -0,0 +1,223 @@ +run: + name: llama2_7b + results_dir: ${base_results_dir}/${.name} + time_limit: "0-01:00:00" + dependency: "singleton" +trainer: + num_nodes: 2 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 300000 # consumed_samples = global_step * global_batch_size + max_time: "05:23:30:00" # days:hours:minutes:seconds + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 32 + limit_test_batches: 50 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + num_sanity_val_steps: 0 
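The `data_prefix` blends in the llama2 base configs above interleave a sampling weight with a preprocessed shard path, one pair per Pile file (the hand-written configs use .0333 per shard and .0334 on the last one). A small sketch of how such a flat blend list can be built; the helper and paths are hypothetical, not part of the launcher.

```python
# Hypothetical helper mirroring the flat [weight, path, weight, path, ...]
# layout of the data_prefix lists above.
def make_data_prefix(data_dir: str, num_shards: int = 30):
    weight = round(1.0 / num_shards, 4)  # ~0.0333 for 30 Pile shards
    blend = []
    for i in range(num_shards):
        blend += [weight, f"{data_dir}/my-llama_{i:02d}_text_document"]
    return blend

print(make_data_prefix("/data")[:4])
# [0.0333, '/data/my-llama_00_text_document', 0.0333, '/data/my-llama_01_text_document']
```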
+exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_llama + create_wandb_logger: true + wandb_logger_kwargs: + project: nemo_llama_pretrain + name: ${training.run.name} + resume_if_exists: false + resume_ignore_no_checkpoint: true + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 10 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + mcore_gpt: true + micro_batch_size: 2 + global_batch_size: 64 + rampup_batch_size: null + tensor_model_parallel_size: 2 + pipeline_model_parallel_size: 1 + virtual_pipeline_model_parallel_size: null + encoder_seq_length: 4096 + max_position_embeddings: 4096 + num_layers: 32 + hidden_size: 4096 + ffn_hidden_size: 11008 + num_attention_heads: 32 + init_method_std: 0.01 + use_scaled_init_method: true + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + kv_channels: null + apply_query_key_layer_scaling: true + normalization: rmsnorm + layernorm_epsilon: 1.0e-05 + do_layer_norm_weight_decay: false + make_vocab_size_divisible_by: 128 + pre_process: true + post_process: true + persist_layer_norm: true + bias: false + activation: fast-swiglu + headscale: false + transformer_block_type: pre_ln + openai_gelu: false + normalize_attention_scores: true + position_embedding_type: rope + rotary_percentage: 1.0 + attention_type: multihead + share_embeddings_and_output_weights: false + tokenizer: + library: 'sentencepiece' + type: null + model: ${data_dir}/llama/llama_tokenizer.model + delimiter: null + vocab_file: null + merge_file: null + sentencepiece_legacy: False + native_amp_init_scale: 4294967296 + native_amp_growth_interval: 1000 + hysteresis: 2 + fp32_residual_connection: false + fp16_lm_cross_entropy: false + megatron_amp_O2: true + grad_allreduce_chunk_size_mb: 125 + grad_div_ar_fusion: true + gradient_accumulation_fusion: true + bias_activation_fusion: false + bias_dropout_add_fusion: false + masked_softmax_fusion: true + seed: 1234 + resume_from_checkpoint: null + use_cpu_initialization: false + onnx_safe: false + apex_transformer_log_level: 30 + gradient_as_bucket_view: true + sync_batch_comm: false + activations_checkpoint_granularity: selective + activations_checkpoint_method: block + activations_checkpoint_num_layers: 0 + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + sequence_parallel: true # does not support sequence parallel + + ## Transformer Engine + # fp8 training is currently not supported in the improved models + transformer_engine: true + fp8: False # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: False # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + use_emha: False + ub_tp_comm_overlap: False + use_flash_attention: true + optim: + name: distributed_fused_adam + lr: 1e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + bucket_cap_mb: 125 + overlap_grad_sync: False + sched: + name: CosineAnnealing + warmup_steps: 500 + constant_steps: 0 + min_lr: 1e-5 + data: + data_impl: mmap + splits_string: "99990,8,2" + seq_length: 4096 + skip_warmup: true + num_workers: 2 + dataloader_type: single + reset_position_ids: false + reset_attention_mask: false + eod_mask_loss: false + index_mapping_dir: null + data_prefix: + - .5 + - ${data_dir}/my-llama_00_text_document + - .5 + - ${data_dir}/my-llama_01_text_document + # - .0333 + # - ${data_dir}/my-gpt3_00_text_document + # - .0333 + # - ${data_dir}/my-gpt3_01_text_document + # - .0333 + # - ${data_dir}/my-gpt3_02_text_document + # - .0333 + # - ${data_dir}/my-gpt3_03_text_document + # - .0333 + # - ${data_dir}/my-gpt3_04_text_document + # - .0333 + # - ${data_dir}/my-gpt3_05_text_document + # - .0333 + # - ${data_dir}/my-gpt3_06_text_document + # - .0333 + # - ${data_dir}/my-gpt3_07_text_document + # - .0333 + # - ${data_dir}/my-gpt3_08_text_document + # - .0333 + # - ${data_dir}/my-gpt3_09_text_document + # - .0333 + # - ${data_dir}/my-gpt3_10_text_document + # - .0333 + # - ${data_dir}/my-gpt3_11_text_document + # - .0333 + # - ${data_dir}/my-gpt3_12_text_document + # - .0333 + # - ${data_dir}/my-gpt3_13_text_document + # - .0333 + # - ${data_dir}/my-gpt3_14_text_document + # - .0333 + # - ${data_dir}/my-gpt3_15_text_document + # - .0333 + # - ${data_dir}/my-gpt3_16_text_document + # - .0333 + # - ${data_dir}/my-gpt3_17_text_document + # - .0333 + # - ${data_dir}/my-gpt3_18_text_document + # - .0333 + # - ${data_dir}/my-gpt3_19_text_document + # - .0333 + # - ${data_dir}/my-gpt3_20_text_document + # - .0333 + # - ${data_dir}/my-gpt3_21_text_document + # - .0333 + # - ${data_dir}/my-gpt3_22_text_document + # - .0333 + # - ${data_dir}/my-gpt3_23_text_document + # - .0333 + # - ${data_dir}/my-gpt3_24_text_document + # - .0333 + # - ${data_dir}/my-gpt3_25_text_document + # - .0333 + # - ${data_dir}/my-gpt3_26_text_document + # - .0333 + # - ${data_dir}/my-gpt3_27_text_document + # - .0333 + # - ${data_dir}/my-gpt3_28_text_document + # - .0334 + # - ${data_dir}/my-gpt3_29_text_document + diff --git a/auto_configurator/conf/search_config/llama/13b.yaml b/auto_configurator/conf/search_config/llama/13b.yaml new file mode 100644 index 0000000000..e18a5f242c --- /dev/null +++ b/auto_configurator/conf/search_config/llama/13b.yaml @@ -0,0 +1,40 @@ +train_settings: + model_size_in_b: 13 # unit in billion parameters + num_nodes: 4 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 1 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 50 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
+ vocab_size: 32000 + seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_13b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: [2] # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: [1,2] # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: [1,2] # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] + +inference_settings: + run: + model_type: gpt3 + model_train_name: gpt3_5b + gpus_per_node: 8 + data_type: "fp16" # fp32|fp16|bf16 + time_limit: 0:30:00 + results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb + tensor_parallel_sizes: [1,2,4] + pipeline_parallel_sizes: [1,2] + benchmark: + input_len: 60 + output_len: 20 + batch_sizes: [4,8,16,32,64,128,256] + beam_width: 1 + topk: 4 + topp: 0.0 diff --git a/auto_configurator/conf/search_config/llama/70b.yaml b/auto_configurator/conf/search_config/llama/70b.yaml new file mode 100644 index 0000000000..eb2d089064 --- /dev/null +++ b/auto_configurator/conf/search_config/llama/70b.yaml @@ -0,0 +1,40 @@ +train_settings: + model_size_in_b: 70 # unit in billion parameters + num_nodes: 8 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 50 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
+ vocab_size: 32000 + seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_70b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] + +inference_settings: + run: + model_type: gpt3 + model_train_name: gpt3_5b + gpus_per_node: 8 + data_type: "fp16" # fp32|fp16|bf16 + time_limit: 0:30:00 + results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb + tensor_parallel_sizes: [1,2,4] + pipeline_parallel_sizes: [1,2] + benchmark: + input_len: 60 + output_len: 20 + batch_sizes: [4,8,16,32,64,128,256] + beam_width: 1 + topk: 4 + topp: 0.0 diff --git a/auto_configurator/conf/search_config/llama/7b.yaml b/auto_configurator/conf/search_config/llama/7b.yaml new file mode 100644 index 0000000000..148f12ff6c --- /dev/null +++ b/auto_configurator/conf/search_config/llama/7b.yaml @@ -0,0 +1,40 @@ +train_settings: + model_size_in_b: 7 # unit in billion parameters + num_nodes: 2 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 1 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 50 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
+ vocab_size: 32000 + seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_7b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] + +inference_settings: + run: + model_type: gpt3 + model_train_name: gpt3_5b + gpus_per_node: 8 + data_type: "fp16" # fp32|fp16|bf16 + time_limit: 0:30:00 + results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb + tensor_parallel_sizes: [1,2,4] + pipeline_parallel_sizes: [1,2] + benchmark: + input_len: 60 + output_len: 20 + batch_sizes: [4,8,16,32,64,128,256] + beam_width: 1 + topk: 4 + topp: 0.0 From 91865fb1ea85838d1ca3f43dd5cb9747f23d37d0 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Thu, 24 Aug 2023 06:14:21 -0700 Subject: [PATCH 33/62] minor fix for PR Signed-off-by: Hongbin Liu --- .../autoconfig/scripts/compare_throughput.py | 7 +++---- launcher_scripts/conf/cluster/bcm.yaml | 12 ++++++------ launcher_scripts/conf/config.yaml | 5 ++--- launcher_scripts/nemo_launcher/core/stages.py | 4 ++-- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/auto_configurator/autoconfig/scripts/compare_throughput.py b/auto_configurator/autoconfig/scripts/compare_throughput.py index d3b5b3e6c2..dcb56fa833 100644 --- a/auto_configurator/autoconfig/scripts/compare_throughput.py +++ b/auto_configurator/autoconfig/scripts/compare_throughput.py @@ -17,7 +17,6 @@ def main(cfg): model_size = settings_cfg.model_size_in_b output_top_n = settings_cfg.output_top_n nodes = settings_cfg.num_nodes - #nodes = cfg.get("nodes") training_logs = os.path.join(settings_cfg.get("logs"), "training_logs") candidate_configs = os.path.join(settings_cfg.get("logs"), "candidate_configs") @@ -139,9 +138,9 @@ def main(cfg): ea.Reload() try: timing_list = ea.Scalars("train_step_timing") - #if len(timing_list) <= 6: - # continue - timing_list = [x.value for x in timing_list[0:]] + if len(timing_list) <= 6: + continue + timing_list = [x.value for x in timing_list[5:]] avg_global_step_time = round(sum(timing_list) / len(timing_list), 4) samples_per_s = round(gbs / avg_global_step_time, 2) m_tflops, m_tflops_gpu = calculate_tflops( diff --git a/launcher_scripts/conf/cluster/bcm.yaml b/launcher_scripts/conf/cluster/bcm.yaml index 8ff05b1fe3..ba8f2ebbb0 100755 --- a/launcher_scripts/conf/cluster/bcm.yaml +++ b/launcher_scripts/conf/cluster/bcm.yaml @@ -1,9 +1,9 @@ -partition: luna -account: devtech -exclusive: true +partition: null +account: null +exclusive: True gpus_per_task: null -gpus_per_node: null +gpus_per_node: 8 mem: 0 -job_name_prefix: 'devtech-gpt:' +job_name_prefix: 'nemo-megatron-' srun_args: - - 
--no-container-mount-home + - "--no-container-mount-home" diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 9729c4901d..979644f9b1 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -31,15 +31,14 @@ stages: #- export cluster_type: bcm # bcm or bcp. If bcm, it must match - cluster above. -launcher_scripts_path: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/launcher_scripts # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts +launcher_scripts_path: ??? # Path to NeMo Megatron Launch scripts, should ends with /launcher_scripts data_dir: ${launcher_scripts_path}/data # Location to store and read the data. -nemo_dir: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/nemo_repo/internal/NeMo base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. container_mounts: # List of additional paths to mount to container. They will be mounted to same path. - null container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3 -wandb_api_key_file: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/NeMo-Megatron-Launcher/wandb_api_key # File where the w&B api key is stored. Key must be on the first line. +wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line. env_vars: NCCL_TOPO_FILE: null # Should be a path to an XML file describing the topology diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 9864331805..aa6ac9c0bf 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -382,7 +382,7 @@ def _launcher_scripts_path(self) -> Path: @property def _nemo_code_path(self) -> Path: - return Path(self.cfg.get("nemo_dir", "/opt/NeMo")) + return Path("/opt/NeMo") @property def _data_dir(self) -> Path: @@ -675,6 +675,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: :return: path current stage's essential nemo scripts code :rtype: Path """ + model_type_to_code_path = { "gpt3" : self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py", "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py", @@ -966,7 +967,6 @@ class EvalHarnessEvaluation(NemoMegatronStage): def __init__(self, cfg): super().__init__(cfg) choice_model_type, choice_name = self.get_stage_config_choice() - #self.prompt_evaluation = choice_model_type == "prompt_gpt3" self.prompt_evaluation = True if "prompt" in choice_model_type else False def setup_stage_vars(self, cfg): From b78aa12f85fdb3a569dbbf3fa25815519d1a65ed Mon Sep 17 00:00:00 2001 From: Robert Clark Date: Fri, 25 Aug 2023 16:33:17 -0500 Subject: [PATCH 34/62] Remove README documentation updates A new user guide will replace the existing README moving forward and the k8s documentation here will no longer be necessary. For legacy purposes, the README updates will be included with the original kubernetes commit for standalone documentation of the kubernetes support. Signed-Off-By: Robert Clark --- README.md | 199 ++---------------------------------------------------- 1 file changed, 7 insertions(+), 192 deletions(-) diff --git a/README.md b/README.md index dbe58feb64..3df4288d54 100755 --- a/README.md +++ b/README.md @@ -19,7 +19,6 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [4.1.1. Common](#411-common) + [4.1.2. 
OCI](#412-oci) + [4.1.3. AWS](#413-aws) - + [4.1.4. Kubernetes](#414-k8s) * [4.2. Cluster Validation](#42-cluster-validation) + [4.2.1. Validation Script Usage](#421-validation-script-usage) + [4.2.2 Running tests manually](#422-running-tests-manually) @@ -33,14 +32,12 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [5.1.1. Prepare Environment](#511-prepare-environment) - [5.1.1.1. Slurm](#5111-slurm) - [5.1.1.2. Base Command Platform](#5112-base-command-platform) - - [5.1.1.3. Kubernetes](#5113-kubernetes) - - [5.1.1.4. General Configuration](#5114-general-configuration) + - [5.1.1.3. General Configuration](#5113-general-configuration) + [5.1.2. Data Preparation](#512-data-preparation) - [5.1.2.1. Data Preparation for GPT Models](#5121-data-preparation-for-gpt-models) * [5.1.2.1.1. Slurm](#51211-slurm) * [5.1.2.1.2. Base Command Platform](#51212-base-command-platform) - * [5.1.2.1.3. Kubernetes](#51213-kubernetes) - * [5.1.2.1.4. Common](#51214-common) + * [5.1.2.1.3. Common](#51213-common) - [5.1.2.2. Data Preparation for T5 Models](#5122-data-preparation-for-t5-models) * [5.1.2.2.1. Slurm](#51221-slurm) * [5.1.2.2.2. Base Command Platform](#51222-base-command-platform) @@ -88,7 +85,6 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co + [5.6.1. GPT Training](#561-gpt-training) - [5.6.1.1. Slurm](#5611-slurm) - [5.6.1.2. Base Command Platform](#5612-base-command-platform) - - [5.6.1.3. Kubernetes](#5613-base-command-platform) + [5.6.2. T5 Training](#562-t5-training) - [5.6.2.1. Slurm](#5621-slurm) - [5.6.2.2. Base Command Platform](#5622-base-command-platform) @@ -104,7 +100,6 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co - [5.8.1.1. Common](#5811-common) - [5.8.1.2. Slurm](#5812-slurm) - [5.8.1.3. Base Command Platform](#5813-base-command-platform) - - [5.8.1.4. Kubernetes](#5814-kubernetes) + [5.8.2. T5 Conversion](#582-t5-conversion) - [5.8.2.1. Common](#5821-common) - [5.8.2.2. Slurm](#5822-slurm) @@ -157,8 +152,7 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co - [5.13.1.1. Common](#51311-common) - [5.13.1.2. Slurm](#51312-slurm) - [5.13.1.3. Base Command Platform](#51313-base-command-platform) - - [5.13.1.4. Kubernetes](#51314-kubernetes) - - [5.13.1.5 Interleaved Pipeline Parallelism](#51314-interleaved-pipeline-parallelism) + - [5.13.1.4 Interleaved Pipeline Parallelism](#51314-interleaved-pipeline-parallelism) + [5.13.2. T5 Evaluation](#5132-t5-evaluation) - [5.13.2.1. Common](#51321-common) - [5.13.2.2. Slurm](#51322-slurm) @@ -377,11 +371,6 @@ Figure 1: The GPT family architecture. The 5B variant includes 24 transformer la | HPC-X | 2.13 | | Base Command Manager | 1.0.0 | | DeepOps | 21.06 | -| Kubernetes | 1.27.4 | -| Helm | 3.12.1 | -| GPU Operator | 23.3.2 | -| Network Operator | 23.1.0 | -| KubeFlow Operator | 1.6.0 | ## 4. Cloud Service Providers @@ -432,23 +421,6 @@ On the scheduler node: container: /path/to/nemo_megatron_launcher/nemo_megatron_training.sqsh ``` -#### 4.1.4. Kubernetes - -Data preparation and training GPT models is currently supported on vanilla kubernetes (k8s) clusters. -The launcher scripts will generate a Helm chart for each task based on the config files and launch the job using the chart. 
- -The following is required for running jobs on Kubernetes: - * One or more DGX A100s/H100s as worker nodes - * An NFS filesystem where the data and launcher scripts will be stored which is accessible on all worker and controller nodes - * A head/controller node which has access to the worker nodes and can run `kubectl` and `helm` to launch jobs and can install Python dependencies - * Recent versions of the GPU, Network, and KubeFlow Operators installed - -A secret key needs to be configured to allow kubernetes to pull from the private registry. For example, if pulling the container directly -from NGC, a secret needs to be created to authenticate with the private NGC registry, such as the following: -``` -kubectl create secret docker-registry ngc-registry --docker-server=nvcr.io --docker-username=\$oauthtoken --docker-password= -``` - ### 4.2. Cluster Validation @@ -632,22 +604,7 @@ creating these workspaces (e.g. `nemo_megatron_data_ws` and `nemo_megatron_resul the Base Command Platform User Guide for how to create and work with Base Command Platform workspaces. -##### 5.1.1.3. Kubernetes - - -The launcher scripts need to be downloaded to the NFS filesystem that is -connected to the worker nodes. This can either be copied at -`/opt/NeMo-Megatron-Launcher` from inside the training container or by cloning -this repository. - -Install the NeMo Framework scripts dependencies on the head node/controller of -the cluster where jobs will be launched: - -``` -pip install -r requirements.txt -``` - -##### 5.1.1.4. General Configuration +##### 5.1.1.3. General Configuration The first parameter that must be set is the `launcher_scripts_path` parameter inside the @@ -895,36 +852,8 @@ The command above assumes you want to prepare the entire dataset (files 0-29), a workspace in `/mount/data`, and the results workspace in `/mount/results`. Stdout and stderr are redirected to the `/results/data_gpt3_log.txt` file, so it can be downloaded from NGC. Any other parameter can also be added to the command to modify its behavior. -###### 5.1.2.1.3. Kubernetes - - -To run data preparation on a kubernetes cluster, set both the `cluster` and -`cluster_type` parameters to `k8s` in `conf/config.yaml`. Additionally, set the -`launcher_scripts_path` parameter to the location where the launcher scripts -are located on the NFS filesystem. This must be the same path on all nodes in -the cluster. Ensure the `stages` parameter is set to `data_preparation` and -`data_preparation` in the `defaults` section points to the intended data -preparation script. - -The `conf/config/k8s.yaml` file also needs to be updated with the -kubernetes container registry secret if created earlier (`pull_secret`), the -`shm_size` to determine how much local memory to put in each pod, and the NFS -server and path to where the launcher scripts are saved. These can all be -overridden from the command line using hydra as well. - -Once all of the config files are updated, the data preparation can be launched -from the controller node with: - -``` -python main.py -``` - -This will generate and launch a job via Helm in the default namespace which -can be viewed with `helm show` or `kubectl get pods`. The logs can be followed -with `kubectl logs `. - -###### 5.1.2.1.4. Common - +###### 5.1.2.1.3. Common + Set the configuration for the data preparation job for GPT models in the YAML file: ```yaml @@ -2533,89 +2462,6 @@ Select the cluster related configuration following the NGC documentation. 
Then, use the `python3 main.py` command to launch the job and override the desired parameters from the training job parameters. -##### 5.6.1.3. Kubernetes - - -Set configuration for your Kubernetes cluster in the `conf/cluster/k8s.yaml` file: - -```yaml -pull_secret: null -shm_size: 512Gi -nfs_server: null -nfs_path: null -ib_resource_name: "nvidia.com/hostdev" -ib_count: "8" -``` - -The settings are as follows: - * `pull_secret`: The name of the sercret key created with `kubectl` that will - be used to authenticate with private registries for pulling the training - container. - * `shm_size`: The amount of shared memory to include in the Pods. It is - recommended to use a large value here. - * `nfs_server`: The IP address or hostname of the NFS server that the worker - nodes will read and write data to/from. - * `nfs_path`: The absolute path on the NFS server that should be mounted - inside the Pods. - * `ib_resource_name`: The name of the IB interconnect to attach to Pods for - multi-node training. This is the name that Kubernetes assigns to the NICs as - allocatable resources. - * `ib_count`: The number of IB interconnects to include per node in each pod. - This will likely equal the total number of active/usable compute NICs per - node. - -And set the training job specific parameters in the `conf/training/(model_type)/(model_size).yaml` file, -using the run section: -```yaml -run: - name: gpt3_126m - results_dir: ${base_results_dir}/${.name} - time_limit: "1-12:00:00" - dependency: "singleton" -``` - -To run only the training pipeline and not the data preparation, evaluation or -inference pipelines, set the `conf/config.yaml` file to: - -```yaml -stages: - - training -``` - -Also set the `cluster` and `cluster_type` values to `k8s` in the -`conf/config.yaml` file. - -And then run: -``` -python3 main.py -``` - -Once the launcher is run, it will display the path to the Helm chart that was -generated based on the updated config files. The Helm chart will be located in -the job results directory by default. The chart will be run automatically and -Pods will be started by Kubernetes once resources become available. The status -of the Helm chart can be checked with: - -``` -$ helm list -NAME NAMESPACE REVISION UPDATED STATUS CHART APP VERSION -gpt-7b-improved default 1 2023-07-17 14:10:11.794541205 -0700 PDT deployed nemo-framework-training-1.0.0 1.0 -``` - -Once allocated, this will spin up N pods for N number of nodes requested. To -view training progress follow the log of the first pod, typically named -`nlp-training-worker-0`. - -Once a job is finished, it will be marked as complete via Helm and can be -uninstalled with (note - replace `` with the name of the Helm chart -as shown in the previous example): - -``` -$ helm uninstall -``` - -The uninstallation will not affect the completed job - it will only mark the -resources as free for Kubernetes to use them for future tasks. #### 5.6.2. T5 Training @@ -2903,22 +2749,6 @@ The command above assumes you mounted the data workspace in `/mount/data`, and t The stdout and stderr outputs will also be redirected to the `/results/convert_gpt3_log.txt` file, to be able to download the logs from NGC. Any other parameter can also be added to the command to modify its behavior. -##### 5.8.1.4. Kubernetes - -To convert a model to the `.nemo` format on a Kubernetes cluster, set both the -`cluster` and `cluster_type` parameters to `k8s` in `conf/config.yaml`. 
Update -the `conf/conversion/gpt3/convert_gpt3.yaml` config file to point to the model -you would like to convert. - -Once the configs are ready, run: - -``` -python3 main.py -``` - -This will launch a Helm chart that will spawn a job that runs on one of the -compute nodes to convert the requested model to the `.nemo` format. - #### 5.8.2. T5 Conversion @@ -4098,22 +3928,7 @@ The command above assumes you mounted the data workspace in `/mount/data`, and t The stdout and stderr outputs will also be redirected to the `/results/eval_gpt3_log.txt` file, to be able to download the logs from NGC. Any other parameter can also be added to the command to modify its behavior. -##### 5.13.1.4. Kubernetes - -To evaluate base models on Kubernetes clusters, set the `cluster` and -`cluster_type` parameters to `k8s` in `conf/config.yaml`. Update either the -`conf/evaluation/gpt3/evaluate_all.yaml` or `conf/evaluation/gpt3/evaluate_lambada.yaml` -file based on your cluster and desired evaluation tasks. Once the configurations -are updated, launch an evaluation job with: - -``` -python3 main.py -``` - -This will launch a Helm chart based on the evaluation configurations which will -download all task files and run evaluation against the specified model. - -##### 5.13.1.5 Interleaved Pipeline Parallelism +##### 5.13.1.4 Interleaved Pipeline Parallelism If your model was trained with interleaved pipeline parallelism, then the model must converted to a non-interleaved model. In order to check if your model used interleaved, inspect the training config and verify that From eef8c20972403b53e8615e8f9caafe02115ed265 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Fri, 25 Aug 2023 21:30:30 -0700 Subject: [PATCH 35/62] Update squad.yaml Update squad.yaml based on update NeMo toolkit PEFT yaml file https://github.com/NVIDIA/NeMo/blob/main/examples/nlp/language_modeling/tuning/conf/megatron_gpt_peft_tuning_config.yaml. NeMo Launcher ptuning, lora and adapter tests could PASS on BCP. --- launcher_scripts/conf/peft/gpt3/squad.yaml | 123 +++++++++++---------- 1 file changed, 64 insertions(+), 59 deletions(-) diff --git a/launcher_scripts/conf/peft/gpt3/squad.yaml b/launcher_scripts/conf/peft/gpt3/squad.yaml index e3da77bba7..fec6a0e8ad 100644 --- a/launcher_scripts/conf/peft/gpt3/squad.yaml +++ b/launcher_scripts/conf/peft/gpt3/squad.yaml @@ -11,20 +11,18 @@ run: results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name} trainer: - devices: 8 - num_nodes: 1 + devices: 1 accelerator: gpu - precision: bf16 - logger: False + num_nodes: 1 + precision: 16 + logger: False # logger provided by exp_manager enable_checkpointing: False - replace_sampler_ddp: False - max_epochs: 4 - max_steps: -1 - log_every_n_steps: 10 - val_check_interval: 200 - accumulate_grad_batches: 1 + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch gradient_clip_val: 1.0 - resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
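The comment on `max_steps` in the trainer block above defines how consumed samples relate to optimizer steps. A quick back-of-the-envelope check of that formula; the batch and parallelism values below are purely illustrative, not taken from this config.

```python
# consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
global_step = 20000           # max_steps above
micro_batch_size = 4
data_parallel_size = 8        # e.g. 8 GPUs with TP=1 and PP=1
accumulate_grad_batches = 1
consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches
print(consumed_samples)       # 640000 sequences seen over the run
```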
exp_manager: explicit_log_dir: null @@ -62,7 +60,7 @@ model: global_batch_size: 128 micro_batch_size: 4 - restore_from_path: ${peft.run.convert_dir}/results/megatron_gpt.nemo # Path to an existing .nemo model you wish to add new tasks to or run inference with + restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. sync_batch_comm: False @@ -80,6 +78,7 @@ model: # of each chunk at the specified granularity # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null answer_only_loss: True gradient_as_bucket_view: False @@ -88,7 +87,7 @@ model: ffn_dropout: 0.0 peft: - peft_scheme: "ptuning" # can be either adapter,ia3, or ptuning + peft_scheme: "adapter" # can be either adapter,ia3, or ptuning restore_from_path: null # Used for adapter peft training @@ -96,16 +95,22 @@ model: type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' adapter_dim: 32 adapter_dropout: 0.0 - norm_position: 'pre' # This can be set to 'pre' or 'post', 'pre' is normally what is used. + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True lora_tuning: adapter_dim: 32 adapter_dropout: 0.0 column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True # Used for p-tuning peft training p_tuning: @@ -113,9 +118,11 @@ model: bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck embedding_dim: 1024 # the size of the prompt encoder embeddings init_std: 0.023 - + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + data: - chat: False # whether use chatbot data or not train_ds: # Example of how to specify paths to multiple datasets # file_names: @@ -124,12 +131,12 @@ model: # - /path/to/boolq.jsonl # Example of how each dataset is formatted # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... 
Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} - file_names: - - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. global_batch_size: ${peft.model.global_batch_size} micro_batch_size: ${peft.model.micro_batch_size} shuffle: True - num_workers: 4 + num_workers: 0 + memmap_workers: 2 pin_memory: True max_seq_length: 2048 min_seq_length: 1 @@ -139,59 +146,29 @@ model: # - 0.5 # - 0.25 # - 0.25 - concat_sampling_probabilities: - - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' context_key: 'input' label_key: 'output' add_eos: True add_sep: False add_bos: False - separate_prompt_and_response_with_newline: True + separate_prompt_and_response_with_newline: False truncation_field: "context" # Options: ['context', 'answer'] index_mapping_dir: null # Path to a directory to write index mapping files. prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" validation_ds: - file_names: - - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. - names: - - ${peft.run.task_name} # Names of the corresponding datasets used to log metrics. - global_batch_size: ${peft.model.global_batch_size} - micro_batch_size: ${peft.model.micro_batch_size} - shuffle: True - num_workers: 4 - pin_memory: True - max_seq_length: ${peft.model.data.train_ds.max_seq_length} - min_seq_length: ${peft.model.data.train_ds.min_seq_length} - drop_last: True - context_key: 'input' - label_key: 'output' - add_eos: ${peft.model.data.train_ds.add_eos} - add_sep: ${peft.model.data.train_ds.add_sep} - add_bos: ${peft.model.data.train_ds.add_bos} - separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} - write_predictions_to_file: False - output_file_path_prefix: null # Prefix of the file to write predictions to. - truncation_field: "context" # Options: ['context', 'answer'] - index_mapping_dir: null # Path to a directory to write index mapping files. - prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" - - metric: - name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] - average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. - num_classes: null - - test_ds: file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. names: null # Names of the corresponding datasets used to log metrics. 
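Each PEFT training record is a single JSONL object with `input` and `output` fields, and `prompt_template` controls how the two are stitched into the training string. A minimal illustration (the record below is made up):

```python
import json

record = {"input": "Q: What year did the SQuAD dataset first appear?",
          "output": "2016"}
prompt_template = "{input} {output}"      # train_ds.prompt_template above
print(json.dumps(record))                 # one line of the JSONL file
print(prompt_template.format(**record))   # text the model is trained on
```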
global_batch_size: ${peft.model.global_batch_size} micro_batch_size: ${peft.model.micro_batch_size} - shuffle: True - num_workers: 4 + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} pin_memory: True - max_seq_length: ${peft.model.data.train_ds.max_seq_length} - min_seq_length: ${peft.model.data.train_ds.min_seq_length} - drop_last: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False context_key: 'input' label_key: 'output' add_eos: ${peft.model.data.train_ds.add_eos} @@ -203,12 +180,40 @@ model: truncation_field: "context" # Options: ['context', 'answer'] index_mapping_dir: null # Path to a directory to write index mapping files. prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" - + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics metric: name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. num_classes: null - + test_ds: + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. 
+ num_classes: null + optim: name: fused_adam lr: 1e-4 From 64cce78192157570073e83b43b362e4d3203487b Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 30 Aug 2023 19:38:07 -0700 Subject: [PATCH 36/62] add end_string for llama evaluation Signed-off-by: Hongbin Liu --- .../collections/eval_harness/lm_eval/models/nemo_llama.py | 1 + 1 file changed, 1 insertion(+) diff --git a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py index 975bdf4b2e..4c7f5e56b8 100755 --- a/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py +++ b/launcher_scripts/nemo_launcher/collections/eval_harness/lm_eval/models/nemo_llama.py @@ -172,6 +172,7 @@ def logits_to_results(batch, response): repetition_penalty=1.0, min_tokens_to_generate=0, compute_logprob=True, + end_strings=[''], ) response = get_computeprob_response(self.tokenizer, response, inputs) From d8b4e4d1048699a3ec877cbb2a45bda83e7f29be Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 30 Aug 2023 19:45:43 -0700 Subject: [PATCH 37/62] update evaluation configs for llama Signed-off-by: Hongbin Liu --- .../conf/evaluation/llama/evaluate_all.yaml | 2 +- .../conf/evaluation/llama/evaluate_boolq.yaml | 24 +++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) create mode 100755 launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml diff --git a/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml index 047f72b866..e354d6ee68 100755 --- a/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml +++ b/launcher_scripts/conf/evaluation/llama/evaluate_all.yaml @@ -16,7 +16,7 @@ model: #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} precision: bf16 # must match training precision - 32, 16 or bf16 diff --git a/launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml b/launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml new file mode 100755 index 0000000000..49ba25236c --- /dev/null +++ b/launcher_scripts/conf/evaluation/llama/evaluate_boolq.yaml @@ -0,0 +1,24 @@ +run: + name: ${.eval_name}_${.model_train_name} + time_limit: "02:00:00" + dependency: "singleton" + nodes: ${divide_ceil:${evaluation.model.model_parallel_size}, 8} # 8 gpus per node + ntasks_per_node: ${divide_ceil:${evaluation.model.model_parallel_size}, ${.nodes}} + eval_name: eval_boolq + model_train_name: llama2_7b + train_dir: ${base_results_dir}/${.model_train_name} + tasks: boolq # supported: lambada, boolq, race, piqa, hellaswag, winogrande, wikitext2, wikitext103 OR all_tasks + results_dir: ${base_results_dir}/${.model_train_name}/${.eval_name} + +model: + model_type: nemo-llama + nemo_model: null # run eval with a .nemo file, produced when converted interleaved checkpoints + #checkpoint_folder: ${evaluation.run.train_dir}/results/checkpoints + #checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. 
megatron_gpt-*last.ckpt) + #hparams_file: ${evaluation.run.train_dir}/results/hparams.yaml + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + model_parallel_size: ${multiply:${.tensor_model_parallel_size}, ${.pipeline_model_parallel_size}} + precision: bf16 # must match training precision - 32, 16 or bf16 + eval_batch_size: 4 + #tokenizer_model: ${data_dir}/llama/llama_tokenizer.model From a9a7de409729e0839b9d92d66461313bdfca899c Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Fri, 1 Sep 2023 13:44:17 -0700 Subject: [PATCH 38/62] config changes Signed-off-by: dimapihtar --- launcher_scripts/conf/training/gpt3/126m.yaml | 6 ++++++ launcher_scripts/conf/training/gpt3/175b.yaml | 8 ++++++-- launcher_scripts/conf/training/gpt3/175b_performance.yaml | 4 ++++ launcher_scripts/conf/training/gpt3/1b_improved.yaml | 8 +++++++- launcher_scripts/conf/training/gpt3/20b.yaml | 6 ++++++ launcher_scripts/conf/training/gpt3/400m_improved.yaml | 8 +++++++- launcher_scripts/conf/training/gpt3/40b.yaml | 6 ++++++ launcher_scripts/conf/training/gpt3/40b_improved.yaml | 8 +++++++- launcher_scripts/conf/training/gpt3/5b.yaml | 6 ++++++ launcher_scripts/conf/training/gpt3/7b_improved.yaml | 8 +++++++- 10 files changed, 62 insertions(+), 6 deletions(-) diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index affee0765e..2e8cd73053 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -64,6 +64,7 @@ model: num_attention_heads: 12 init_method_std: 0.023 # Standard deviation of the zero mean normal distribution used for weight initialization.') hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 # Dropout probability for attention kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. layernorm_epsilon: 1e-5 @@ -90,6 +91,10 @@ model: ## Sequence Parallelism sequence_parallel: False + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + tokenizer: library: 'megatron' type: 'GPT2BPETokenizer' @@ -120,6 +125,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: False diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index 493d24d516..971bd4ccdb 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -63,6 +63,7 @@ model: num_attention_heads: 96 init_method_std: 0.006 # Standard deviation of the zero mean normal distribution used for weight initialization.') hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 # Dropout probability for attention kv_channels: null # Projection weights dimension in multi-head attention. 
Set to hidden_size // num_attention_heads if null apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. layernorm_epsilon: 1e-5 @@ -89,6 +90,10 @@ model: ## Sequence Parallelism sequence_parallel: True + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + tokenizer: library: 'megatron' type: 'GPT2BPETokenizer' @@ -119,6 +124,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: True @@ -128,8 +134,6 @@ model: use_cpu_initialization: False # Init weights on the CPU (slow for large models) onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this - overlap_p2p_comm: True # Overlap p2p communication with computes - batch_p2p_comm: False # Batch consecutive inter-peer send/recv operations # Nsys profiling options nsys_profile: diff --git a/launcher_scripts/conf/training/gpt3/175b_performance.yaml b/launcher_scripts/conf/training/gpt3/175b_performance.yaml index 976deda501..780e636ba8 100755 --- a/launcher_scripts/conf/training/gpt3/175b_performance.yaml +++ b/launcher_scripts/conf/training/gpt3/175b_performance.yaml @@ -65,6 +65,7 @@ model: num_attention_heads: 96 init_method_std: 0.006 # Standard deviation of the zero mean normal distribution used for weight initialization.') hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. layernorm_epsilon: 1e-5 @@ -90,6 +91,8 @@ model: ## Sequence Parallelism sequence_parallel: True + + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. tokenizer: library: 'megatron' @@ -121,6 +124,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: True diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml index 1ff6b3dbf0..e6d473c840 100644 --- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml @@ -66,7 +66,8 @@ model: ffn_dropout: 0.0 kv_channels: null apply_query_key_layer_scaling: true - normalization: layernorm1p + normalization: LayerNorm + layernorm_zero_centered_gamma: True layernorm_epsilon: 1.0e-05 do_layer_norm_weight_decay: false make_vocab_size_divisible_by: 128 @@ -116,6 +117,10 @@ model: activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False @@ -126,6 +131,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: False diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml index e48788e197..b4185d922f 100755 --- a/launcher_scripts/conf/training/gpt3/20b.yaml +++ b/launcher_scripts/conf/training/gpt3/20b.yaml @@ -63,6 +63,7 @@ model: num_attention_heads: 48 init_method_std: 0.008165 # Standard deviation of the zero mean normal distribution used for weight initialization.') hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 # Dropout probability for attention kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. layernorm_epsilon: 1e-5 @@ -89,6 +90,10 @@ model: ## Sequence Parallelism sequence_parallel: True + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + tokenizer: library: 'megatron' type: 'GPT2BPETokenizer' @@ -119,6 +124,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: True diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml index 5b1e6b915f..e4b4a6e31f 100644 --- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml @@ -66,7 +66,8 @@ model: ffn_dropout: 0.0 kv_channels: null apply_query_key_layer_scaling: true - normalization: layernorm1p + normalization: LayerNorm + layernorm_zero_centered_gamma: True layernorm_epsilon: 1.0e-05 do_layer_norm_weight_decay: false make_vocab_size_divisible_by: 128 @@ -116,6 +117,10 @@ model: activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False @@ -126,6 +131,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: False diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml index 84c1802bc9..bf660ea9b4 100755 --- a/launcher_scripts/conf/training/gpt3/40b.yaml +++ b/launcher_scripts/conf/training/gpt3/40b.yaml @@ -63,6 +63,7 @@ model: num_attention_heads: 64 init_method_std: 0.007 # Standard deviation of the zero mean normal distribution used for weight initialization.') hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. layernorm_epsilon: 1e-5 @@ -89,6 +90,10 @@ model: ## Sequence Parallelism sequence_parallel: True + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + tokenizer: library: 'megatron' type: 'GPT2BPETokenizer' @@ -119,6 +124,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: True diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml index 8686a171be..af5f14f2b2 100644 --- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml @@ -66,7 +66,8 @@ model: ffn_dropout: 0.0 kv_channels: null apply_query_key_layer_scaling: true - normalization: layernorm1p + normalization: LayerNorm + layernorm_zero_centered_gamma: True layernorm_epsilon: 1.0e-05 do_layer_norm_weight_decay: false make_vocab_size_divisible_by: 128 @@ -116,6 +117,10 @@ model: activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False @@ -126,6 +131,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: False diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml index ae99d3e063..0ba8d80b89 100755 --- a/launcher_scripts/conf/training/gpt3/5b.yaml +++ b/launcher_scripts/conf/training/gpt3/5b.yaml @@ -63,6 +63,7 @@ model: num_attention_heads: 32 init_method_std: 0.01 # Standard deviation of the zero mean normal distribution used for weight initialization.') hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 # Dropout probability for attention kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. layernorm_epsilon: 1e-5 @@ -89,6 +90,10 @@ model: ## Sequence Parallelism sequence_parallel: False + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + tokenizer: library: 'megatron' type: 'GPT2BPETokenizer' @@ -119,6 +124,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: False diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml index 0eec1b43ba..8cd14cad59 100644 --- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml @@ -66,7 +66,8 @@ model: ffn_dropout: 0.0 kv_channels: null apply_query_key_layer_scaling: true - normalization: layernorm1p + normalization: LayerNorm + layernorm_zero_centered_gamma: True layernorm_epsilon: 1.0e-05 do_layer_norm_weight_decay: false make_vocab_size_divisible_by: 128 @@ -116,6 +117,10 @@ model: activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel + overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False @@ -126,6 +131,7 @@ model: fp8_interval: 1 # scaling update interval fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True use_emha: False ub_tp_comm_overlap: False From 5ee14f1730562e3b0a97b2583301a691a8231e1b Mon Sep 17 00:00:00 2001 From: Dmytro Pykhtar <37850217+dimapihtar@users.noreply.github.com> Date: Tue, 5 Sep 2023 22:23:08 +0300 Subject: [PATCH 39/62] Update config.yaml --- launcher_scripts/conf/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index d172a06c98..523edfff11 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -45,7 +45,7 @@ env_vars: NCCL_DEBUG: null # Logging level for NCCL. Set to "INFO" for debug information NCCL_PROTO: null # Protocol NCCL will use. 
Set to "simple" for AWS TRANSFORMERS_OFFLINE: 1 - NCCL_AVOID_RECORD_STREAMS: 1 + TORCH_NCCL_AVOID_RECORD_STREAMS: 1 # GPU Mapping numa_mapping: From a95eb7167b3a68ee99b0e49125676f5571790cf6 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 6 Sep 2023 02:19:02 -0700 Subject: [PATCH 40/62] update llama training scripts Signed-off-by: Hongbin Liu --- .../llama/{13b.yaml => llama1_13b.yaml} | 15 +- .../llama/{30b.yaml => llama1_30b.yaml} | 16 +- .../llama/{65b.yaml => llama1_65b.yaml} | 17 +-- .../llama/{7b.yaml => llama1_7b.yaml} | 11 +- .../conf/training/llama/llama2_13b.yaml | 13 +- .../conf/training/llama/llama2_70b.yaml | 15 +- .../conf/training/llama/llama2_7b.yaml | 137 +++++++++--------- 7 files changed, 109 insertions(+), 115 deletions(-) rename launcher_scripts/conf/training/llama/{13b.yaml => llama1_13b.yaml} (96%) rename launcher_scripts/conf/training/llama/{30b.yaml => llama1_30b.yaml} (94%) rename launcher_scripts/conf/training/llama/{65b.yaml => llama1_65b.yaml} (94%) rename launcher_scripts/conf/training/llama/{7b.yaml => llama1_7b.yaml} (97%) diff --git a/launcher_scripts/conf/training/llama/13b.yaml b/launcher_scripts/conf/training/llama/llama1_13b.yaml similarity index 96% rename from launcher_scripts/conf/training/llama/13b.yaml rename to launcher_scripts/conf/training/llama/llama1_13b.yaml index e06835be27..2d71dadb3a 100644 --- a/launcher_scripts/conf/training/llama/13b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_13b.yaml @@ -4,7 +4,7 @@ run: time_limit: 0-02:00:00 dependency: singleton trainer: - num_nodes: 4 + num_nodes: 32 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -48,9 +47,9 @@ exp_manager: model: mcore_gpt: true micro_batch_size: 2 - global_batch_size: 128 + global_batch_size: 2048 rampup_batch_size: null - tensor_model_parallel_size: 4 + tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 2048 @@ -110,12 +109,12 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: 0 activations_checkpoint_layers_per_pipeline: 0 - sequence_parallel: false + sequence_parallel: true transformer_engine: true fp8: false fp8_e4m3: false @@ -126,7 +125,7 @@ model: fp8_amax_compute_algo: most_recent use_emha: false ub_tp_comm_overlap: false - use_flash_attention: false + use_flash_attention: true optim: name: distributed_fused_adam lr: 0.0001 @@ -135,7 +134,7 @@ model: - 0.9 - 0.95 bucket_cap_mb: 125 - overlap_grad_sync: false + overlap_grad_sync: true sched: name: CosineAnnealing warmup_steps: 107 diff --git a/launcher_scripts/conf/training/llama/30b.yaml b/launcher_scripts/conf/training/llama/llama1_30b.yaml similarity index 94% rename from launcher_scripts/conf/training/llama/30b.yaml rename to launcher_scripts/conf/training/llama/llama1_30b.yaml index ebdfa06f02..43cc420c46 100644 --- a/launcher_scripts/conf/training/llama/30b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_30b.yaml @@ -49,7 +49,7 @@ model: global_batch_size: 2048 rampup_batch_size: null tensor_model_parallel_size: 4 - pipeline_model_parallel_size: 4 + 
pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 2048 max_position_embeddings: 2048 @@ -84,7 +84,7 @@ model: tokenizer: library: sentencepiece type: null - model: ${data_dir}/llama/llama_tokenizer.model + model: ${data_dir}/llama/llama_tokenizer.model delimiter: null vocab_file: null merge_file: null @@ -108,12 +108,12 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 - num_micro_batches_with_partial_activation_checkpoints: 2 - activations_checkpoint_layers_per_pipeline: 32 - sequence_parallel: false + num_micro_batches_with_partial_activation_checkpoints: 0 + activations_checkpoint_layers_per_pipeline: 0 + sequence_parallel: true transformer_engine: true fp8: false fp8_e4m3: false @@ -124,7 +124,7 @@ model: fp8_amax_compute_algo: most_recent use_emha: false ub_tp_comm_overlap: false - use_flash_attention: false + use_flash_attention: true optim: name: distributed_fused_adam lr: 0.0001 @@ -133,7 +133,7 @@ model: - 0.9 - 0.95 bucket_cap_mb: 125 - overlap_grad_sync: false + overlap_grad_sync: true sched: name: CosineAnnealing warmup_steps: 107 diff --git a/launcher_scripts/conf/training/llama/65b.yaml b/launcher_scripts/conf/training/llama/llama1_65b.yaml similarity index 94% rename from launcher_scripts/conf/training/llama/65b.yaml rename to launcher_scripts/conf/training/llama/llama1_65b.yaml index 0f65d40071..e61cf1ca59 100644 --- a/launcher_scripts/conf/training/llama/65b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_65b.yaml @@ -4,7 +4,7 @@ run: time_limit: 0-01:00:00 dependency: singleton trainer: - num_nodes: 16 + num_nodes: 128 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -50,8 +49,8 @@ model: global_batch_size: 2048 rampup_batch_size: null tensor_model_parallel_size: 4 - pipeline_model_parallel_size: 8 - virtual_pipeline_model_parallel_size: 10 + pipeline_model_parallel_size: 4 + virtual_pipeline_model_parallel_size: 20 encoder_seq_length: 2048 max_position_embeddings: 2048 num_layers: 80 @@ -109,12 +108,12 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 - num_micro_batches_with_partial_activation_checkpoints: 80 + num_micro_batches_with_partial_activation_checkpoints: 0 activations_checkpoint_layers_per_pipeline: 0 - sequence_parallel: false + sequence_parallel: true transformer_engine: true fp8: false fp8_e4m3: false @@ -125,7 +124,7 @@ model: fp8_amax_compute_algo: most_recent use_emha: false ub_tp_comm_overlap: false - use_flash_attention: false + use_flash_attention: true optim: name: distributed_fused_adam lr: 0.0001 @@ -134,7 +133,7 @@ model: - 0.9 - 0.95 bucket_cap_mb: 125 - overlap_grad_sync: false + overlap_grad_sync: true sched: name: CosineAnnealing warmup_steps: 107 diff --git a/launcher_scripts/conf/training/llama/7b.yaml b/launcher_scripts/conf/training/llama/llama1_7b.yaml similarity index 97% rename from launcher_scripts/conf/training/llama/7b.yaml rename to 
launcher_scripts/conf/training/llama/llama1_7b.yaml index cc1bb32c15..090a0881a2 100644 --- a/launcher_scripts/conf/training/llama/7b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_7b.yaml @@ -4,7 +4,7 @@ run: time_limit: "0-04:00:00" dependency: "singleton" trainer: - num_nodes: 2 + num_nodes: 16 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -48,9 +47,9 @@ exp_manager: model: mcore_gpt: true micro_batch_size: 2 - global_batch_size: 128 + global_batch_size: 2048 rampup_batch_size: null - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 2048 @@ -110,7 +109,7 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: null @@ -138,7 +137,7 @@ model: - 0.9 - 0.95 bucket_cap_mb: 125 - overlap_grad_sync: False + overlap_grad_sync: true sched: name: CosineAnnealing warmup_steps: 500 diff --git a/launcher_scripts/conf/training/llama/llama2_13b.yaml b/launcher_scripts/conf/training/llama/llama2_13b.yaml index a77584a33d..1e1ff0c7d8 100644 --- a/launcher_scripts/conf/training/llama/llama2_13b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_13b.yaml @@ -4,7 +4,7 @@ run: time_limit: 0-01:00:00 dependency: singleton trainer: - num_nodes: 4 + num_nodes: 32 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -47,10 +46,10 @@ exp_manager: buffer_size: 5 model: mcore_gpt: true - micro_batch_size: 2 - global_batch_size: 128 + micro_batch_size: 1 + global_batch_size: 2048 rampup_batch_size: null - tensor_model_parallel_size: 4 + tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 4096 @@ -110,7 +109,7 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: 0 @@ -135,7 +134,7 @@ model: - 0.9 - 0.95 bucket_cap_mb: 125 - overlap_grad_sync: false + overlap_grad_sync: true sched: name: CosineAnnealing warmup_steps: 107 diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml index 7697b36e0f..5cee07a21a 100644 --- a/launcher_scripts/conf/training/llama/llama2_70b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml @@ -4,7 +4,7 @@ run: time_limit: 0-01:00:00 dependency: singleton trainer: - num_nodes: 8 + num_nodes: 128 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -47,11 +46,11 @@ exp_manager: model: mcore_gpt: true micro_batch_size: 1 - 
global_batch_size: 128 + global_batch_size: 2048 rampup_batch_size: null tensor_model_parallel_size: 4 pipeline_model_parallel_size: 4 - virtual_pipeline_model_parallel_size: null + virtual_pipeline_model_parallel_size: 20 encoder_seq_length: 4096 max_position_embeddings: 4096 num_layers: 80 @@ -110,9 +109,9 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block - activations_checkpoint_num_layers: 1 + activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: 0 activations_checkpoint_layers_per_pipeline: 0 sequence_parallel: true @@ -127,8 +126,8 @@ model: use_emha: false ub_tp_comm_overlap: false use_flash_attention: true - overlap_p2p_comm: false - batch_p2p_comm: true + overlap_p2p_comm: true + batch_p2p_comm: false gc_interval: 100 optim: name: distributed_fused_adam diff --git a/launcher_scripts/conf/training/llama/llama2_7b.yaml b/launcher_scripts/conf/training/llama/llama2_7b.yaml index dcc2887bcf..87bac9f902 100644 --- a/launcher_scripts/conf/training/llama/llama2_7b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_7b.yaml @@ -1,10 +1,10 @@ run: name: llama2_7b results_dir: ${base_results_dir}/${.name} - time_limit: "0-01:00:00" + time_limit: "0-01:30:00" dependency: "singleton" trainer: - num_nodes: 2 + num_nodes: 16 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -47,10 +46,10 @@ exp_manager: model: mcore_gpt: true - micro_batch_size: 2 - global_batch_size: 128 + micro_batch_size: 1 + global_batch_size: 2048 rampup_batch_size: null - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 4096 @@ -110,12 +109,12 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null - sequence_parallel: true # does not support sequence parallel + sequence_parallel: false ## Transformer Engine # fp8 training is currently not supported in the improved models @@ -138,7 +137,7 @@ model: - 0.9 - 0.95 bucket_cap_mb: 125 - overlap_grad_sync: False + overlap_grad_sync: true sched: name: CosineAnnealing warmup_steps: 500 @@ -160,64 +159,64 @@ model: - ${data_dir}/my-llama_00_text_document - .0333 - ${data_dir}/my-llama_01_text_document - - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - - ${data_dir}/my-llama_02_text_document - - .0333 - - ${data_dir}/my-llama_03_text_document - - .0333 - - ${data_dir}/my-llama_04_text_document - - .0333 - - ${data_dir}/my-llama_05_text_document - - .0333 - - ${data_dir}/my-llama_06_text_document - - .0333 - - ${data_dir}/my-llama_07_text_document - - .0333 - - ${data_dir}/my-llama_08_text_document - - .0333 - - ${data_dir}/my-llama_09_text_document - - .0333 - - ${data_dir}/my-llama_10_text_document - - .0333 - - ${data_dir}/my-llama_11_text_document - - .0333 - - 
${data_dir}/my-llama_12_text_document - - .0333 - - ${data_dir}/my-llama_13_text_document - - .0333 - - ${data_dir}/my-llama_14_text_document - - .0333 - - ${data_dir}/my-llama_15_text_document - - .0333 - - ${data_dir}/my-llama_16_text_document - - .0333 - - ${data_dir}/my-llama_17_text_document - - .0333 - - ${data_dir}/my-llama_18_text_document - - .0333 - - ${data_dir}/my-llama_19_text_document - - .0333 - - ${data_dir}/my-llama_20_text_document - - .0333 - - ${data_dir}/my-llama_21_text_document - - .0333 - - ${data_dir}/my-llama_22_text_document - - .0333 - - ${data_dir}/my-llama_23_text_document - - .0333 - - ${data_dir}/my-llama_24_text_document - - .0333 - - ${data_dir}/my-llama_25_text_document - - .0333 - - ${data_dir}/my-llama_26_text_document - - .0333 - - ${data_dir}/my-llama_27_text_document - - .0333 - - ${data_dir}/my-llama_28_text_document - - .0334 - - ${data_dir}/my-llama_29_text_document + #- .0333 + #- ${data_dir}/my-llama_00_text_document + #- .0333 + #- ${data_dir}/my-llama_01_text_document + #- .0333 + #- ${data_dir}/my-llama_02_text_document + #- .0333 + #- ${data_dir}/my-llama_03_text_document + #- .0333 + #- ${data_dir}/my-llama_04_text_document + #- .0333 + #- ${data_dir}/my-llama_05_text_document + #- .0333 + #- ${data_dir}/my-llama_06_text_document + #- .0333 + #- ${data_dir}/my-llama_07_text_document + #- .0333 + #- ${data_dir}/my-llama_08_text_document + #- .0333 + #- ${data_dir}/my-llama_09_text_document + #- .0333 + #- ${data_dir}/my-llama_10_text_document + #- .0333 + #- ${data_dir}/my-llama_11_text_document + #- .0333 + #- ${data_dir}/my-llama_12_text_document + #- .0333 + #- ${data_dir}/my-llama_13_text_document + #- .0333 + #- ${data_dir}/my-llama_14_text_document + #- .0333 + #- ${data_dir}/my-llama_15_text_document + #- .0333 + #- ${data_dir}/my-llama_16_text_document + #- .0333 + #- ${data_dir}/my-llama_17_text_document + #- .0333 + #- ${data_dir}/my-llama_18_text_document + #- .0333 + #- ${data_dir}/my-llama_19_text_document + #- .0333 + #- ${data_dir}/my-llama_20_text_document + #- .0333 + #- ${data_dir}/my-llama_21_text_document + #- .0333 + #- ${data_dir}/my-llama_22_text_document + #- .0333 + #- ${data_dir}/my-llama_23_text_document + #- .0333 + #- ${data_dir}/my-llama_24_text_document + #- .0333 + #- ${data_dir}/my-llama_25_text_document + #- .0333 + #- ${data_dir}/my-llama_26_text_document + #- .0333 + #- ${data_dir}/my-llama_27_text_document + #- .0333 + #- ${data_dir}/my-llama_28_text_document + #- .0334 + #- ${data_dir}/my-llama_29_text_document From 45b9f41037d79a81475e106a6bd98cd0114399f6 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 6 Sep 2023 02:22:00 -0700 Subject: [PATCH 41/62] update data preparation script for llama Signed-off-by: Hongbin Liu --- .../conf/data_preparation/llama/download_llama_pile.yaml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml index 863f817661..ab317e8a8d 100755 --- a/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml +++ b/launcher_scripts/conf/data_preparation/llama/download_llama_pile.yaml @@ -10,14 +10,9 @@ run: dataset: pile download_the_pile: True # Whether to download the pile dataset from the internet. the_pile_url: "https://the-eye.eu/public/AI/pile/train/" # Source URL to download The Pile dataset from. -file_numbers: "0-1" # The pile dataset consists of 30 files (0-29), choose which ones to download. 
+file_numbers: "0-29" # The pile dataset consists of 30 files (0-29), choose which ones to download. preprocess_data: True # True to preprocess the data from a jsonl file, False otherwise. download_tokenizer_url: "https://huggingface.co/decapoda-research/llama-7b-hf/resolve/main/tokenizer.model" -#download_vocab_url: "https://huggingface.co/gpt2/resolve/main/vocab.json" # URL to download the vocab from. -#download_merges_url: "https://huggingface.co/gpt2/resolve/main/merges.txt" # URL to download the merges from. -#vocab_save_dir: ${data_dir}/bpe -#merges_save_dir: ${data_dir}/bpe -#tokenizer_type: GPT2BPETokenizer tokenizer_library: "sentencepiece" tokenizer_save_dir: ${data_dir}/llama tokenizer_model: ${.tokenizer_save_dir}/llama_tokenizer.model From 702ea64ee603b64b6fc97de0f2a6d5968cf2893d Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 6 Sep 2023 04:03:56 -0700 Subject: [PATCH 42/62] update llama training scripts Signed-off-by: Hongbin Liu --- .../conf/training/llama/llama1_13b.yaml | 6 +- .../conf/training/llama/llama1_30b.yaml | 8 +- .../conf/training/llama/llama1_65b.yaml | 6 +- .../conf/training/llama/llama1_7b.yaml | 6 +- .../conf/training/llama/llama2_13b.yaml | 6 +- .../conf/training/llama/llama2_70b.yaml | 4 - .../conf/training/llama/llama2_7b.yaml | 118 +++++++++--------- 7 files changed, 69 insertions(+), 85 deletions(-) diff --git a/launcher_scripts/conf/training/llama/llama1_13b.yaml b/launcher_scripts/conf/training/llama/llama1_13b.yaml index 2d71dadb3a..3c2fd60daf 100644 --- a/launcher_scripts/conf/training/llama/llama1_13b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_13b.yaml @@ -135,6 +135,8 @@ model: - 0.95 bucket_cap_mb: 125 overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true sched: name: CosineAnnealing warmup_steps: 107 @@ -157,10 +159,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/launcher_scripts/conf/training/llama/llama1_30b.yaml b/launcher_scripts/conf/training/llama/llama1_30b.yaml index 43cc420c46..93dee04071 100644 --- a/launcher_scripts/conf/training/llama/llama1_30b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_30b.yaml @@ -4,7 +4,7 @@ run: time_limit: 0-01:00:00 dependency: singleton trainer: - num_nodes: 16 + num_nodes: 32 devices: 8 accelerator: gpu precision: bf16 @@ -134,6 +134,8 @@ model: - 0.95 bucket_cap_mb: 125 overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true sched: name: CosineAnnealing warmup_steps: 107 @@ -156,10 +158,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/launcher_scripts/conf/training/llama/llama1_65b.yaml b/launcher_scripts/conf/training/llama/llama1_65b.yaml index e61cf1ca59..d39259caae 100644 --- a/launcher_scripts/conf/training/llama/llama1_65b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_65b.yaml @@ -134,6 +134,8 @@ model: - 0.95 bucket_cap_mb: 125 overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true sched: name: CosineAnnealing warmup_steps: 107 @@ -156,10 +158,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - 
${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/launcher_scripts/conf/training/llama/llama1_7b.yaml b/launcher_scripts/conf/training/llama/llama1_7b.yaml index 090a0881a2..a8acb21e7d 100644 --- a/launcher_scripts/conf/training/llama/llama1_7b.yaml +++ b/launcher_scripts/conf/training/llama/llama1_7b.yaml @@ -138,6 +138,8 @@ model: - 0.95 bucket_cap_mb: 125 overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true sched: name: CosineAnnealing warmup_steps: 500 @@ -160,10 +162,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/launcher_scripts/conf/training/llama/llama2_13b.yaml b/launcher_scripts/conf/training/llama/llama2_13b.yaml index 1e1ff0c7d8..3d4dc8d0b1 100644 --- a/launcher_scripts/conf/training/llama/llama2_13b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_13b.yaml @@ -135,6 +135,8 @@ model: - 0.95 bucket_cap_mb: 125 overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true sched: name: CosineAnnealing warmup_steps: 107 @@ -157,10 +159,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/launcher_scripts/conf/training/llama/llama2_70b.yaml b/launcher_scripts/conf/training/llama/llama2_70b.yaml index 5cee07a21a..0beb5f8bca 100644 --- a/launcher_scripts/conf/training/llama/llama2_70b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_70b.yaml @@ -162,10 +162,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/launcher_scripts/conf/training/llama/llama2_7b.yaml b/launcher_scripts/conf/training/llama/llama2_7b.yaml index 87bac9f902..7df5de9940 100644 --- a/launcher_scripts/conf/training/llama/llama2_7b.yaml +++ b/launcher_scripts/conf/training/llama/llama2_7b.yaml @@ -138,6 +138,8 @@ model: - 0.95 bucket_cap_mb: 125 overlap_grad_sync: true + overlap_param_sync: true + contiguous_grad_buffer: true sched: name: CosineAnnealing warmup_steps: 500 @@ -159,64 +161,60 @@ model: - ${data_dir}/my-llama_00_text_document - .0333 - ${data_dir}/my-llama_01_text_document - #- .0333 - #- ${data_dir}/my-llama_00_text_document - #- .0333 - #- ${data_dir}/my-llama_01_text_document - #- .0333 - #- ${data_dir}/my-llama_02_text_document - #- .0333 - #- ${data_dir}/my-llama_03_text_document - #- .0333 - #- ${data_dir}/my-llama_04_text_document - #- .0333 - #- ${data_dir}/my-llama_05_text_document - #- .0333 - #- ${data_dir}/my-llama_06_text_document - #- .0333 - #- ${data_dir}/my-llama_07_text_document - #- .0333 - #- ${data_dir}/my-llama_08_text_document - #- .0333 - #- ${data_dir}/my-llama_09_text_document - #- .0333 - #- ${data_dir}/my-llama_10_text_document - #- .0333 - #- ${data_dir}/my-llama_11_text_document - #- .0333 - #- ${data_dir}/my-llama_12_text_document - #- .0333 - #- ${data_dir}/my-llama_13_text_document - #- .0333 - #- 
${data_dir}/my-llama_14_text_document - #- .0333 - #- ${data_dir}/my-llama_15_text_document - #- .0333 - #- ${data_dir}/my-llama_16_text_document - #- .0333 - #- ${data_dir}/my-llama_17_text_document - #- .0333 - #- ${data_dir}/my-llama_18_text_document - #- .0333 - #- ${data_dir}/my-llama_19_text_document - #- .0333 - #- ${data_dir}/my-llama_20_text_document - #- .0333 - #- ${data_dir}/my-llama_21_text_document - #- .0333 - #- ${data_dir}/my-llama_22_text_document - #- .0333 - #- ${data_dir}/my-llama_23_text_document - #- .0333 - #- ${data_dir}/my-llama_24_text_document - #- .0333 - #- ${data_dir}/my-llama_25_text_document - #- .0333 - #- ${data_dir}/my-llama_26_text_document - #- .0333 - #- ${data_dir}/my-llama_27_text_document - #- .0333 - #- ${data_dir}/my-llama_28_text_document - #- .0334 - #- ${data_dir}/my-llama_29_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - ${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document From 4a5c91e19b4447edf18fade4708533493100032d Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 6 Sep 2023 04:08:54 -0700 Subject: [PATCH 43/62] update llama2 config in auto configurator Signed-off-by: Hongbin Liu --- .../base_configs/llama2_13b.yaml | 13 +- .../base_configs/llama2_70b.yaml | 11 +- auto_configurator/base_configs/llama2_7b.yaml | 130 +++++++++--------- 3 files changed, 70 insertions(+), 84 deletions(-) diff --git a/auto_configurator/base_configs/llama2_13b.yaml b/auto_configurator/base_configs/llama2_13b.yaml index a77584a33d..b3f20fd0c2 100644 --- a/auto_configurator/base_configs/llama2_13b.yaml +++ b/auto_configurator/base_configs/llama2_13b.yaml @@ -4,7 +4,7 @@ run: time_limit: 0-01:00:00 dependency: singleton trainer: - num_nodes: 4 + num_nodes: 2 devices: 8 accelerator: gpu precision: bf16 @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -37,7 +36,7 @@ exp_manager: save_top_k: 10 mode: min always_save_nemo: false - save_nemo_on_train_end: true + save_nemo_on_train_end: false filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} 
model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} @@ -50,7 +49,7 @@ model: micro_batch_size: 2 global_batch_size: 128 rampup_batch_size: null - tensor_model_parallel_size: 4 + tensor_model_parallel_size: 2 pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 4096 @@ -110,7 +109,7 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: 0 @@ -158,10 +157,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/auto_configurator/base_configs/llama2_70b.yaml b/auto_configurator/base_configs/llama2_70b.yaml index 7697b36e0f..f624f5bb03 100644 --- a/auto_configurator/base_configs/llama2_70b.yaml +++ b/auto_configurator/base_configs/llama2_70b.yaml @@ -20,7 +20,6 @@ trainer: limit_test_batches: 50 accumulate_grad_batches: 1 gradient_clip_val: 1.0 - num_sanity_val_steps: 0 exp_manager: explicit_log_dir: ${training.run.results_dir}/results exp_dir: null @@ -37,7 +36,7 @@ exp_manager: save_top_k: 10 mode: min always_save_nemo: false - save_nemo_on_train_end: true + save_nemo_on_train_end: false filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples} model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} log_step_timing: true @@ -110,9 +109,9 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block - activations_checkpoint_num_layers: 1 + activations_checkpoint_num_layers: 0 num_micro_batches_with_partial_activation_checkpoints: 0 activations_checkpoint_layers_per_pipeline: 0 sequence_parallel: true @@ -163,10 +162,6 @@ model: - .0333 - ${data_dir}/my-llama_01_text_document - .0333 - - ${data_dir}/my-llama_00_text_document - - .0333 - - ${data_dir}/my-llama_01_text_document - - .0333 - ${data_dir}/my-llama_02_text_document - .0333 - ${data_dir}/my-llama_03_text_document diff --git a/auto_configurator/base_configs/llama2_7b.yaml b/auto_configurator/base_configs/llama2_7b.yaml index 39222af385..95733d1f53 100755 --- a/auto_configurator/base_configs/llama2_7b.yaml +++ b/auto_configurator/base_configs/llama2_7b.yaml @@ -4,7 +4,7 @@ run: time_limit: "0-01:00:00" dependency: "singleton" trainer: - num_nodes: 2 + num_nodes: 1 devices: 8 accelerator: gpu precision: bf16 @@ -48,9 +48,9 @@ exp_manager: model: mcore_gpt: true micro_batch_size: 2 - global_batch_size: 64 + global_batch_size: 128 rampup_batch_size: null - tensor_model_parallel_size: 2 + tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 virtual_pipeline_model_parallel_size: null encoder_seq_length: 4096 @@ -110,12 +110,12 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective + activations_checkpoint_granularity: null activations_checkpoint_method: block activations_checkpoint_num_layers: 0 
num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null - sequence_parallel: true # does not support sequence parallel + sequence_parallel: false # does not support sequence parallel ## Transformer Engine # fp8 training is currently not supported in the improved models @@ -156,68 +156,64 @@ model: eod_mask_loss: false index_mapping_dir: null data_prefix: - - .5 + - .0333 - ${data_dir}/my-llama_00_text_document - - .5 + - .0333 - ${data_dir}/my-llama_01_text_document - # - .0333 - # - ${data_dir}/my-gpt3_00_text_document - # - .0333 - # - ${data_dir}/my-gpt3_01_text_document - # - .0333 - # - ${data_dir}/my-gpt3_02_text_document - # - .0333 - # - ${data_dir}/my-gpt3_03_text_document - # - .0333 - # - ${data_dir}/my-gpt3_04_text_document - # - .0333 - # - ${data_dir}/my-gpt3_05_text_document - # - .0333 - # - ${data_dir}/my-gpt3_06_text_document - # - .0333 - # - ${data_dir}/my-gpt3_07_text_document - # - .0333 - # - ${data_dir}/my-gpt3_08_text_document - # - .0333 - # - ${data_dir}/my-gpt3_09_text_document - # - .0333 - # - ${data_dir}/my-gpt3_10_text_document - # - .0333 - # - ${data_dir}/my-gpt3_11_text_document - # - .0333 - # - ${data_dir}/my-gpt3_12_text_document - # - .0333 - # - ${data_dir}/my-gpt3_13_text_document - # - .0333 - # - ${data_dir}/my-gpt3_14_text_document - # - .0333 - # - ${data_dir}/my-gpt3_15_text_document - # - .0333 - # - ${data_dir}/my-gpt3_16_text_document - # - .0333 - # - ${data_dir}/my-gpt3_17_text_document - # - .0333 - # - ${data_dir}/my-gpt3_18_text_document - # - .0333 - # - ${data_dir}/my-gpt3_19_text_document - # - .0333 - # - ${data_dir}/my-gpt3_20_text_document - # - .0333 - # - ${data_dir}/my-gpt3_21_text_document - # - .0333 - # - ${data_dir}/my-gpt3_22_text_document - # - .0333 - # - ${data_dir}/my-gpt3_23_text_document - # - .0333 - # - ${data_dir}/my-gpt3_24_text_document - # - .0333 - # - ${data_dir}/my-gpt3_25_text_document - # - .0333 - # - ${data_dir}/my-gpt3_26_text_document - # - .0333 - # - ${data_dir}/my-gpt3_27_text_document - # - .0333 - # - ${data_dir}/my-gpt3_28_text_document - # - .0334 - # - ${data_dir}/my-gpt3_29_text_document + - .0333 + - ${data_dir}/my-llama_02_text_document + - .0333 + - ${data_dir}/my-llama_03_text_document + - .0333 + - ${data_dir}/my-llama_04_text_document + - .0333 + - ${data_dir}/my-llama_05_text_document + - .0333 + - ${data_dir}/my-llama_06_text_document + - .0333 + - ${data_dir}/my-llama_07_text_document + - .0333 + - ${data_dir}/my-llama_08_text_document + - .0333 + - ${data_dir}/my-llama_09_text_document + - .0333 + - ${data_dir}/my-llama_10_text_document + - .0333 + - ${data_dir}/my-llama_11_text_document + - .0333 + - ${data_dir}/my-llama_12_text_document + - .0333 + - ${data_dir}/my-llama_13_text_document + - .0333 + - ${data_dir}/my-llama_14_text_document + - .0333 + - ${data_dir}/my-llama_15_text_document + - .0333 + - ${data_dir}/my-llama_16_text_document + - .0333 + - ${data_dir}/my-llama_17_text_document + - .0333 + - ${data_dir}/my-llama_18_text_document + - .0333 + - ${data_dir}/my-llama_19_text_document + - .0333 + - ${data_dir}/my-llama_20_text_document + - .0333 + - ${data_dir}/my-llama_21_text_document + - .0333 + - ${data_dir}/my-llama_22_text_document + - .0333 + - ${data_dir}/my-llama_23_text_document + - .0333 + - ${data_dir}/my-llama_24_text_document + - .0333 + - ${data_dir}/my-llama_25_text_document + - .0333 + - ${data_dir}/my-llama_26_text_document + - .0333 + - ${data_dir}/my-llama_27_text_document + - .0333 + - 
${data_dir}/my-llama_28_text_document + - .0334 + - ${data_dir}/my-llama_29_text_document From 485463966fe060b4dc70948018270f8128f76129 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Wed, 6 Sep 2023 04:13:21 -0700 Subject: [PATCH 44/62] add llama config in auto configurator Signed-off-by: Hongbin Liu --- .../conf/search_config/llama/13b.yaml | 31 ++++---------- .../conf/search_config/llama/70b.yaml | 21 +--------- .../conf/search_config/llama/7b.yaml | 26 ++---------- .../conf/search_config/llama/7b_nemo.yaml | 40 +++++++++++++++++++ 4 files changed, 53 insertions(+), 65 deletions(-) create mode 100644 auto_configurator/conf/search_config/llama/7b_nemo.yaml diff --git a/auto_configurator/conf/search_config/llama/13b.yaml b/auto_configurator/conf/search_config/llama/13b.yaml index e18a5f242c..0035e650bb 100644 --- a/auto_configurator/conf/search_config/llama/13b.yaml +++ b/auto_configurator/conf/search_config/llama/13b.yaml @@ -1,40 +1,23 @@ train_settings: model_size_in_b: 13 # unit in billion parameters - num_nodes: 4 + num_nodes: 2 gpus_per_node: 8 gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. max_training_days: 5 # unit in days - limit_search_runs: 1 # Max number of runs to be launched in parallel for grid search. + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. output_top_n: 10 # The result will print the top N fastest training configs. - max_steps_per_run: 50 # Max steps per run for the grid search. + max_steps_per_run: 100 # Max steps per run for the grid search. max_minutes_per_run: 30 # minutes per run for the grid search. tflops_per_gpu: 150 # Estimated tflops per GPU. num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. vocab_size: 32000 seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] - custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_13b.yaml # path to custom .yaml model config instead of using auto-generated + custom_config: {auto_configurator_path}/base_configs/llama2_13b.yaml # path to custom .yaml model config instead of using auto-generated logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m - tensor_parallel_sizes: [2] # auto to use our recommendation, or a list, such as [1, 2, 4, 8] - pipeline_parallel_sizes: [1,2] # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism - micro_batch_sizes: [1,2] # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] -inference_settings: - run: - model_type: gpt3 - model_train_name: gpt3_5b - gpus_per_node: 8 - data_type: "fp16" # fp32|fp16|bf16 - time_limit: 0:30:00 - results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb - tensor_parallel_sizes: [1,2,4] - 
pipeline_parallel_sizes: [1,2] - benchmark: - input_len: 60 - output_len: 20 - batch_sizes: [4,8,16,32,64,128,256] - beam_width: 1 - topk: 4 - topp: 0.0 diff --git a/auto_configurator/conf/search_config/llama/70b.yaml b/auto_configurator/conf/search_config/llama/70b.yaml index eb2d089064..ee41a9ccda 100644 --- a/auto_configurator/conf/search_config/llama/70b.yaml +++ b/auto_configurator/conf/search_config/llama/70b.yaml @@ -6,13 +6,13 @@ train_settings: max_training_days: 5 # unit in days limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. output_top_n: 10 # The result will print the top N fastest training configs. - max_steps_per_run: 50 # Max steps per run for the grid search. + max_steps_per_run: 100 # Max steps per run for the grid search. max_minutes_per_run: 30 # minutes per run for the grid search. tflops_per_gpu: 150 # Estimated tflops per GPU. num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. vocab_size: 32000 seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] - custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_70b.yaml # path to custom .yaml model config instead of using auto-generated + custom_config: {auto_configurator_path}/base_configs/llama2_70b.yaml # path to custom .yaml model config instead of using auto-generated logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] @@ -21,20 +21,3 @@ train_settings: micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] -inference_settings: - run: - model_type: gpt3 - model_train_name: gpt3_5b - gpus_per_node: 8 - data_type: "fp16" # fp32|fp16|bf16 - time_limit: 0:30:00 - results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb - tensor_parallel_sizes: [1,2,4] - pipeline_parallel_sizes: [1,2] - benchmark: - input_len: 60 - output_len: 20 - batch_sizes: [4,8,16,32,64,128,256] - beam_width: 1 - topk: 4 - topp: 0.0 diff --git a/auto_configurator/conf/search_config/llama/7b.yaml b/auto_configurator/conf/search_config/llama/7b.yaml index 148f12ff6c..bfe1756413 100644 --- a/auto_configurator/conf/search_config/llama/7b.yaml +++ b/auto_configurator/conf/search_config/llama/7b.yaml @@ -1,18 +1,18 @@ train_settings: model_size_in_b: 7 # unit in billion parameters - num_nodes: 2 + num_nodes: 1 gpus_per_node: 8 gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. max_training_days: 5 # unit in days - limit_search_runs: 1 # Max number of runs to be launched in parallel for grid search. + limit_search_runs: 100 # Max number of runs to be launched in parallel for grid search. output_top_n: 10 # The result will print the top N fastest training configs. - max_steps_per_run: 50 # Max steps per run for the grid search. + max_steps_per_run: 100 # Max steps per run for the grid search. max_minutes_per_run: 30 # minutes per run for the grid search. tflops_per_gpu: 150 # Estimated tflops per GPU. num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
vocab_size: 32000 seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] - custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_7b.yaml # path to custom .yaml model config instead of using auto-generated + custom_config: {auto_configurator_path}/base_configs/llama2_7b.yaml # path to custom .yaml model config instead of using auto-generated logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] @@ -20,21 +20,3 @@ train_settings: max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] - -inference_settings: - run: - model_type: gpt3 - model_train_name: gpt3_5b - gpus_per_node: 8 - data_type: "fp16" # fp32|fp16|bf16 - time_limit: 0:30:00 - results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb - tensor_parallel_sizes: [1,2,4] - pipeline_parallel_sizes: [1,2] - benchmark: - input_len: 60 - output_len: 20 - batch_sizes: [4,8,16,32,64,128,256] - beam_width: 1 - topk: 4 - topp: 0.0 diff --git a/auto_configurator/conf/search_config/llama/7b_nemo.yaml b/auto_configurator/conf/search_config/llama/7b_nemo.yaml new file mode 100644 index 0000000000..aca9819929 --- /dev/null +++ b/auto_configurator/conf/search_config/llama/7b_nemo.yaml @@ -0,0 +1,40 @@ +train_settings: + model_size_in_b: 7 # unit in billion parameters + num_nodes: 2 + gpus_per_node: 8 + gpu_memory_gb: 80 # Memory per GPU, in GB. Currently 40GB and 80GB A100s supported. + max_training_days: 5 # unit in days + limit_search_runs: 10 # Max number of runs to be launched in parallel for grid search. + output_top_n: 10 # The result will print the top N fastest training configs. + max_steps_per_run: 100 # Max steps per run for the grid search. + max_minutes_per_run: 30 # minutes per run for the grid search. + tflops_per_gpu: 150 # Estimated tflops per GPU. + num_tokens_in_b: 300 # Unit in billions, typically 300B for GPT3 models. 
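The train_settings above define the axes of the auto-configurator's grid search (tensor/pipeline parallelism and micro batch size, bounded by the model-parallel limits and limit_search_runs). A rough conceptual sketch of how such a candidate space gets enumerated and pruned; this is not the auto_configurator's actual search code, and the explicit lists and bounds are illustrative stand-ins for the "auto" defaults:

    from itertools import product

    tp_sizes, pp_sizes, micro_batches = [1, 2, 4], [1, 2], [1, 2, 4]   # example axes
    min_mp, max_mp, limit_search_runs = 1, 8, 100                      # example bounds

    candidates = []
    for tp, pp, mbs in product(tp_sizes, pp_sizes, micro_batches):
        mp = tp * pp                      # model-parallel size of this candidate
        if min_mp <= mp <= max_mp:
            candidates.append({"tp": tp, "pp": pp, "mbs": mbs})
    candidates = candidates[:limit_search_runs]   # cap how many training runs are launched
    print(f"{len(candidates)} candidate runs")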
+ vocab_size: 32000 + seq_length: 4096 # available seq_length list for GPT-3 models: [2048, 4096, 8192, 16384, 32768] + custom_config: /lustre/fsw/devtech/hpc-devtech/hongbinl/nemo_megatron/scripts/support_llama/NeMo-Megatron-Launcher/auto_configurator/base_configs/llama2_7b.yaml # path to custom .yaml model config instead of using auto-generated + logs: ${base_results_dir}/${search_config_value}_${.gpu_memory_gb}gb # Example base_results_dir/gpt3/126m + tensor_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8] + pipeline_parallel_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 10] + min_model_parallel_size: auto # auto to use our recommendation, or a value for the minimum desired parallelism + max_model_parallel_size: auto # auto to use our recommendation, or a value for the maximum desired parallelism + micro_batch_sizes: auto # auto to use our recommendation, or a list, such as [1, 2, 4, 8, 16] + act_ckpt_layers: auto # auto to use our recommendation, or a list, such as [0, 1, 2, 3] + +inference_settings: + run: + model_type: gpt3 + model_train_name: gpt3_5b + gpus_per_node: 8 + data_type: "fp16" # fp32|fp16|bf16 + time_limit: 0:30:00 + results_dir: ${base_results_dir}/${search_config_value}_${search_config.train_settings.gpu_memory_gb}gb + tensor_parallel_sizes: [1,2,4] + pipeline_parallel_sizes: [1,2] + benchmark: + input_len: 60 + output_len: 20 + batch_sizes: [4,8,16,32,64,128,256] + beam_width: 1 + topk: 4 + topp: 0.0 From fd3c364b25db89c2114f563afa0f1cee40cb357f Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Fri, 8 Sep 2023 04:49:08 -0700 Subject: [PATCH 45/62] add sft support for llama Signed-off-by: Hongbin Liu --- .../conf/fine_tuning/llama/squad.yaml | 187 ++++++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 1 + 2 files changed, 188 insertions(+) create mode 100644 launcher_scripts/conf/fine_tuning/llama/squad.yaml diff --git a/launcher_scripts/conf/fine_tuning/llama/squad.yaml b/launcher_scripts/conf/fine_tuning/llama/squad.yaml new file mode 100644 index 0000000000..0162280247 --- /dev/null +++ b/launcher_scripts/conf/fine_tuning/llama/squad.yaml @@ -0,0 +1,187 @@ +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: llama_sft + convert_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.convert_name} + task_name: "squad" # Rename this name to be more clear + results_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.task_name} + +trainer: + devices: 8 + accelerator: gpu + num_nodes: 1 + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 13000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 300 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 
0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: ${fine_tuning.run.results_dir}/results + exp_dir: null + name: megatron_llama_${fine_tuning.run.task_name} + create_wandb_logger: False + wandb_logger_kwargs: + project: nemo_llama_${fine_tuning.run.task_name} + name: ${fine_tuning.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${fine_tuning.model.data.validation_ds.metric.name} + save_top_k: 5 + mode: min + save_nemo_on_train_end: True + filename: 'megatron_gpt_sft--{${.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${fine_tuning.model.tensor_model_parallel_size}, ${fine_tuning.model.pipeline_model_parallel_size}} + save_best_model: True + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + global_batch_size: 32 + micro_batch_size: 4 + restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_llama.nemo # Path to an existing p-tuned/prompt tuned .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: True + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: uniform # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + answer_only_loss: True # not used right now + gradient_as_bucket_view: False + seq_len_interpolation_factor: null # if not None, seq_len_interpolation_factor will match the base model's value + use_flash_attention: True # if not None, will match the base model's value + + hidden_dropout: 0.1 + attention_dropout: 0.1 + ffn_dropout: 0.1 + + data: + chat: False # whether use chatbot data or not + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: + - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. 
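A quick sanity check of the batch geometry this SFT config sets up, using only values that appear above (8 devices on one node, TP and PP both 1, micro batch 4, global batch 32); the data-parallel relation itself is standard Megatron/NeMo bookkeeping:

    world_size = 1 * 8                  # num_nodes * devices
    dp = world_size // (1 * 1)          # / (TP * PP) -> data-parallel size 8
    grad_acc = 32 // (4 * dp)           # global_batch_size / (micro_batch_size * DP) = 1
    samples_per_step = 4 * dp * grad_acc
    assert samples_per_step == 32       # matches the consumed_samples comment above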
+ global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: 4096 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: + - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + separate_prompt_and_response_with_newline: True + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: + - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: + - ${fine_tuning.run.task_name} # Names of the corresponding datasets used to log metrics. + global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length} + min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length} + drop_last: True + context_key: 'input' + label_key: 'output' + add_eos: ${fine_tuning.model.data.train_ds.add_eos} + add_sep: ${fine_tuning.model.data.train_ds.add_sep} + add_bos: ${fine_tuning.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + test_ds: + file_names: ${fine_tuning.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. 
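The file_names above point at JSON-lines data whose records carry the 'input' and 'output' fields selected by context_key/label_key and joined at train time by prompt_template. One plausible way a file like train-v1.1_gpt.json could be produced from raw SQuAD v1.1 is sketched below; the actual preprocessing behind these paths is not part of this patch, so the prompt wording here is an assumption:

    import json

    def squad_to_sft_jsonl(squad_json: str, out_path: str) -> None:
        """Flatten SQuAD v1.1 into one {'input': ..., 'output': ...} record per line."""
        with open(squad_json) as f:
            data = json.load(f)["data"]
        with open(out_path, "w") as out:
            for article in data:
                for para in article["paragraphs"]:
                    for qa in para["qas"]:
                        record = {
                            "input": f"Context: {para['context']} Question: {qa['question']} Answer:",
                            "output": qa["answers"][0]["text"] if qa["answers"] else "",
                        }
                        out.write(json.dumps(record) + "\n")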
+ global_batch_size: ${fine_tuning.model.global_batch_size} + micro_batch_size: ${fine_tuning.model.micro_batch_size} + shuffle: True + num_workers: 4 + pin_memory: True + max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length} + min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length} + drop_last: True + context_key: 'input' + label_key: 'output' + add_eos: ${fine_tuning.model.data.train_ds.add_eos} + add_sep: ${fine_tuning.model.data.train_ds.add_sep} + add_bos: ${fine_tuning.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${fine_tuning.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${fine_tuning.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: distributed_fused_adam # Supports distributed optimizer for memory savings. To enable, set to 'distributed_fused_adam'. Needs Apex to be built with specific args to work. + lr: 1e-6 + weight_decay: 0.1 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + monitor: validation_${fine_tuning.model.data.validation_ds.metric.name} + min_lr: 1e-8 + warmup_steps: 1000 + last_epoch: -1 + + diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index ebe0e9c210..291b2c565d 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -682,6 +682,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: model_type_to_code_path = { "gpt3" : self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py", + "llama" : self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_sft.py", "t5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py", "mt5": self._nemo_code_path / "examples/nlp/language_modeling/megatron_t5_seq2seq_finetune.py", } From d9a30597a5a035c2a91338dec1558e098dfed2b1 Mon Sep 17 00:00:00 2001 From: Hongbin Liu Date: Fri, 8 Sep 2023 07:01:51 -0700 Subject: [PATCH 46/62] add peft support for llama Signed-off-by: Hongbin Liu --- .../conf/fine_tuning/llama/squad.yaml | 2 +- launcher_scripts/conf/peft/llama/squad.yaml | 234 ++++++++++++++++++ launcher_scripts/nemo_launcher/core/stages.py | 1 + 3 files changed, 236 insertions(+), 1 deletion(-) create mode 100644 launcher_scripts/conf/peft/llama/squad.yaml diff --git a/launcher_scripts/conf/fine_tuning/llama/squad.yaml b/launcher_scripts/conf/fine_tuning/llama/squad.yaml index 0162280247..cc954a846f 100644 --- a/launcher_scripts/conf/fine_tuning/llama/squad.yaml +++ b/launcher_scripts/conf/fine_tuning/llama/squad.yaml @@ -3,7 +3,7 @@ run: time_limit: "04:00:00" dependency: "singleton" convert_name: convert_nemo - model_train_name: llama_sft + model_train_name: llama2_7b convert_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.convert_name} task_name: "squad" # Rename this name to be 
more clear results_dir: ${base_results_dir}/${fine_tuning.run.model_train_name}/${fine_tuning.run.task_name} diff --git a/launcher_scripts/conf/peft/llama/squad.yaml b/launcher_scripts/conf/peft/llama/squad.yaml new file mode 100644 index 0000000000..c958ba30dc --- /dev/null +++ b/launcher_scripts/conf/peft/llama/squad.yaml @@ -0,0 +1,234 @@ +name: megatron_llama_peft_tuning-${peft.model.peft.peft_scheme} + +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: llama2_7b + convert_dir: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name} + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name} + +trainer: + devices: 8 + accelerator: gpu + num_nodes: 1 + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${peft.name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${peft.model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${peft.name}--{${peft.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${peft.model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 128 + micro_batch_size: 4 + restore_from_path: ${fine_tuning.run.convert_dir}/results/megatron_llama.nemo # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. + save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: True + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. 
+ sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + answer_only_loss: True + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + + peft: + peft_scheme: "lora" # can be either adapter,ia3, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: + - ${data_dir}/squad_data/v1.1/train-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. 
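For intuition on why the lora_tuning block above trains so few parameters: with rank adapter_dim = 32, every adapted linear layer gains two small matrices of shape (in_features × r) and (r × out_features). A back-of-the-envelope count, assuming the adapters sit on the fused attention QKV projection of a Llama-2-7B-shaped model (32 layers, hidden size 4096, full multi-head attention); the exact set of adapted layers is decided by NeMo and is not spelled out in this config:

    hidden, layers, r = 4096, 32, 32        # model shape; r = adapter_dim above
    qkv_out = 3 * hidden                    # fused query/key/value projection width
    per_layer = r * (hidden + qkv_out)      # A: hidden x r, B: r x qkv_out
    total = layers * per_layer
    print(total, f"{total / 7e9:.2%}")      # ~16.8M params, roughly 0.24% of 7B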
+ global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 4096 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: + - 1.0 # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: True + separate_prompt_and_response_with_newline: False + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: + - ${data_dir}/squad_data/v1.1/dev-v1.1_gpt.json # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: + - ${fine_tuning.run.task_name} # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length} + min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length} + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. 
+ global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: ${fine_tuning.model.data.train_ds.max_seq_length} + min_seq_length: ${fine_tuning.model.data.train_ds.min_seq_length} + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + monitor: val_loss + reduce_on_plateau: false diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 291b2c565d..06fd8fa261 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -725,6 +725,7 @@ def _get_nemo_code_path(self, model_type: str) -> Path: raise NotImplementedError("PEFT is not supported in NeMo Megatron mt5 models.") model_type_to_code_path = { "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py", + "llama": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py", } return model_type_to_code_path[model_type] From 5d011e3b0bc2cc377b243ca9effadcf6111f4651 Mon Sep 17 00:00:00 2001 From: Abhinav Khattar Date: Fri, 8 Sep 2023 10:59:47 -0700 Subject: [PATCH 47/62] update tests Signed-off-by: Abhinav Khattar --- launcher_scripts/conf/training/bert/100b.yaml | 10 +++++----- launcher_scripts/conf/training/bert/110m.yaml | 6 +++--- launcher_scripts/conf/training/bert/20b.yaml | 6 +++--- launcher_scripts/conf/training/bert/4b.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/126m.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/175b.yaml | 6 +++--- .../conf/training/gpt3/175b_performance.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/1b_improved.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/20b.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/400m_improved.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/40b.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/40b_improved.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/5b.yaml | 6 +++--- launcher_scripts/conf/training/gpt3/7b_improved.yaml | 6 +++--- 14 files changed, 44 insertions(+), 44 deletions(-) diff --git 
a/launcher_scripts/conf/training/bert/100b.yaml b/launcher_scripts/conf/training/bert/100b.yaml index 84d7170dae..8d26a5b7b8 100755 --- a/launcher_scripts/conf/training/bert/100b.yaml +++ b/launcher_scripts/conf/training/bert/100b.yaml @@ -97,11 +97,11 @@ model: onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - activations_checkpoint_granularity: selective - activations_checkpoint_method: block - activations_checkpoint_layers_per_pipeline: 1 - num_micro_batches_with_partial_activation_checkpoints: 96 - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_layers_per_pipeline: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_num_layers: null sequence_parallel: True diff --git a/launcher_scripts/conf/training/bert/110m.yaml b/launcher_scripts/conf/training/bert/110m.yaml index 8d72872eb2..2988141040 100755 --- a/launcher_scripts/conf/training/bert/110m.yaml +++ b/launcher_scripts/conf/training/bert/110m.yaml @@ -98,11 +98,11 @@ model: gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - activations_checkpoint_granularity: selective - activations_checkpoint_method: block + activations_checkpoint_granularity: null + activations_checkpoint_method: null activations_checkpoint_layers_per_pipeline: null num_micro_batches_with_partial_activation_checkpoints: null - activations_checkpoint_num_layers: 0 + activations_checkpoint_num_layers: null sequence_parallel: False diff --git a/launcher_scripts/conf/training/bert/20b.yaml b/launcher_scripts/conf/training/bert/20b.yaml index 729b8e0ef7..1a2d033c7e 100755 --- a/launcher_scripts/conf/training/bert/20b.yaml +++ b/launcher_scripts/conf/training/bert/20b.yaml @@ -97,11 +97,11 @@ model: onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. gradient_as_bucket_view: True # PyTorch DDP argument. Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - activations_checkpoint_granularity: selective - activations_checkpoint_method: block + activations_checkpoint_granularity: null + activations_checkpoint_method: null activations_checkpoint_layers_per_pipeline: null num_micro_batches_with_partial_activation_checkpoints: null - activations_checkpoint_num_layers: 1 + activations_checkpoint_num_layers: null sequence_parallel: True diff --git a/launcher_scripts/conf/training/bert/4b.yaml b/launcher_scripts/conf/training/bert/4b.yaml index e925f5621a..484f17c998 100755 --- a/launcher_scripts/conf/training/bert/4b.yaml +++ b/launcher_scripts/conf/training/bert/4b.yaml @@ -97,11 +97,11 @@ model: onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. gradient_as_bucket_view: True # PyTorch DDP argument. 
Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - activations_checkpoint_granularity: selective - activations_checkpoint_method: block + activations_checkpoint_granularity: null + activations_checkpoint_method: null activations_checkpoint_layers_per_pipeline: null num_micro_batches_with_partial_activation_checkpoints: null - activations_checkpoint_num_layers: 0 + activations_checkpoint_num_layers: null sequence_parallel: False diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index 2e8cd73053..b816072f74 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -82,9 +82,9 @@ model: masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. ## Activation Checkpointing - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: block # 'uniform', 'block' - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index 971bd4ccdb..37c6672ccc 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -81,9 +81,9 @@ model: masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. ## Activation Checkpointing - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: block # 'uniform', 'block' - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null diff --git a/launcher_scripts/conf/training/gpt3/175b_performance.yaml b/launcher_scripts/conf/training/gpt3/175b_performance.yaml index 780e636ba8..1b615f3b08 100755 --- a/launcher_scripts/conf/training/gpt3/175b_performance.yaml +++ b/launcher_scripts/conf/training/gpt3/175b_performance.yaml @@ -83,9 +83,9 @@ model: masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. 
## Activation Checkpointing - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: block # 'uniform', 'block' - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml index e6d473c840..23917f0b6f 100644 --- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml @@ -110,9 +110,9 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective - activations_checkpoint_method: block - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml index b4185d922f..d677565daa 100755 --- a/launcher_scripts/conf/training/gpt3/20b.yaml +++ b/launcher_scripts/conf/training/gpt3/20b.yaml @@ -81,9 +81,9 @@ model: masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. ## Activation Checkpointing - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: block # 'uniform', 'block' - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml index e4b4a6e31f..5666be015c 100644 --- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml @@ -110,9 +110,9 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective - activations_checkpoint_method: block - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml index bf660ea9b4..2350d0ed07 100755 --- a/launcher_scripts/conf/training/gpt3/40b.yaml +++ b/launcher_scripts/conf/training/gpt3/40b.yaml @@ -81,9 +81,9 @@ model: masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. 
## Activation Checkpointing - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: block # 'uniform', 'block' - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml index af5f14f2b2..d96532dbbc 100644 --- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml @@ -110,9 +110,9 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective - activations_checkpoint_method: block - activations_checkpoint_num_layers: 1 + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml index 0ba8d80b89..2981029d91 100755 --- a/launcher_scripts/conf/training/gpt3/5b.yaml +++ b/launcher_scripts/conf/training/gpt3/5b.yaml @@ -81,9 +81,9 @@ model: masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. ## Activation Checkpointing - activations_checkpoint_granularity: selective # 'selective' or 'full' - activations_checkpoint_method: block # 'uniform', 'block' - activations_checkpoint_num_layers: 0 + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml index 8cd14cad59..ffe7f89e55 100644 --- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml @@ -110,9 +110,9 @@ model: apex_transformer_log_level: 30 gradient_as_bucket_view: true sync_batch_comm: false - activations_checkpoint_granularity: selective - activations_checkpoint_method: block - activations_checkpoint_num_layers: 8 + activations_checkpoint_granularity: null + activations_checkpoint_method: null + activations_checkpoint_num_layers: null num_micro_batches_with_partial_activation_checkpoints: null activations_checkpoint_layers_per_pipeline: null sequence_parallel: false # does not support sequence parallel From 5b2d3ba3fe482055677234e57ec78de674cc324e Mon Sep 17 00:00:00 2001 From: Eric Harper Date: Sat, 9 Sep 2023 12:54:39 -0600 Subject: [PATCH 48/62] add if Signed-off-by: Eric Harper --- launcher_scripts/nemo_launcher/collections/conditional_cfgs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py b/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py index ec6be9845f..57a2d3eae5 100644 --- a/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py +++ b/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py @@ -54,7 
+54,7 @@ def get_ag_overlap(cfg): if __name__ == "__main__": - elif sys.argv[1] == "name=get_ln_sm_margin": + if sys.argv[1] == "name=get_ln_sm_margin": get_ln_sm_margin() elif sys.argv[1] == "name=get_ag_overlap": get_ag_overlap() From 76436a952dcfc6fcae9aff5869dd4409e133971a Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Mon, 11 Sep 2023 17:39:12 -0700 Subject: [PATCH 49/62] remove ft from auto_configurator config --- auto_configurator/conf/config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/auto_configurator/conf/config.yaml b/auto_configurator/conf/config.yaml index aa75cfff7a..ab712b4a83 100644 --- a/auto_configurator/conf/config.yaml +++ b/auto_configurator/conf/config.yaml @@ -15,7 +15,6 @@ run_inference_hp_search: True cluster_type: bcm # bcm or bcp auto_configurator_path: ??? # Path to the location of auto_configurator codebase. launcher_scripts_path: ${auto_configurator_path}/../launcher_scripts -fastertransformer_path: ${auto_configurator_path}/../FasterTransformer base_results_dir: ${auto_configurator_path}/results data_dir: ${launcher_scripts_path}/data From 48dea84ea390c1b834a98b73ce2b4a970b98eabd Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Mon, 11 Sep 2023 19:39:32 -0700 Subject: [PATCH 50/62] change print --- launcher_scripts/nemo_launcher/collections/conditional_cfgs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py b/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py index 57a2d3eae5..f1fff5c18a 100644 --- a/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py +++ b/launcher_scripts/nemo_launcher/collections/conditional_cfgs.py @@ -31,7 +31,7 @@ def get_ln_sm_margin(cfg): """ global cuda_capability if cuda_capability == 9: - print(4) + print(8) else: print(0) From b89407cc4218a72e5f6d62f4ad50d018616e119d Mon Sep 17 00:00:00 2001 From: David Date: Mon, 11 Sep 2023 20:33:11 -0700 Subject: [PATCH 51/62] updating GPT configs (mcore and te) (#124) Signed-off-by: David Mosallanezhad --- launcher_scripts/conf/training/gpt3/126m.yaml | 5 ++++- launcher_scripts/conf/training/gpt3/175b.yaml | 5 ++++- launcher_scripts/conf/training/gpt3/175b_performance.yaml | 5 ++++- launcher_scripts/conf/training/gpt3/1b_improved.yaml | 3 +++ launcher_scripts/conf/training/gpt3/20b.yaml | 5 ++++- launcher_scripts/conf/training/gpt3/400m_improved.yaml | 3 +++ launcher_scripts/conf/training/gpt3/40b.yaml | 5 ++++- launcher_scripts/conf/training/gpt3/40b_improved.yaml | 3 +++ launcher_scripts/conf/training/gpt3/5b.yaml | 5 ++++- launcher_scripts/conf/training/gpt3/7b_improved.yaml | 3 +++ 10 files changed, 36 insertions(+), 6 deletions(-) diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index a6c7028da7..bd719118af 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -118,10 +118,13 @@ model: megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters grad_allreduce_chunk_size_mb: 125 + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # To use fp8, please set `transformer_engine=True` and `fp8=True`. 
# The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: True + transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index 187edaf3f7..df8049ba77 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -117,10 +117,13 @@ model: megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters grad_allreduce_chunk_size_mb: 125 + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # To use fp8, please set `transformer_engine=True` and `fp8=True`. # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: True + transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID diff --git a/launcher_scripts/conf/training/gpt3/175b_performance.yaml b/launcher_scripts/conf/training/gpt3/175b_performance.yaml index 68dccabf9b..7c0d55b37f 100755 --- a/launcher_scripts/conf/training/gpt3/175b_performance.yaml +++ b/launcher_scripts/conf/training/gpt3/175b_performance.yaml @@ -117,10 +117,13 @@ model: megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters grad_allreduce_chunk_size_mb: 125 + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # To use fp8, please set `transformer_engine=True` and `fp8=True`. # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: True + transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml index 22b1bd6fad..173497fc55 100644 --- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml @@ -125,6 +125,9 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml index d759cfdbe7..d54a2dc0ed 100755 --- a/launcher_scripts/conf/training/gpt3/20b.yaml +++ b/launcher_scripts/conf/training/gpt3/20b.yaml @@ -117,10 +117,13 @@ model: megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters grad_allreduce_chunk_size_mb: 125 + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # To use fp8, please set `transformer_engine=True` and `fp8=True`. 
# The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: True + transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml index 78cd1bc9da..6385fa7526 100644 --- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml @@ -125,6 +125,9 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml index 24bf2a1494..8a4a562d5a 100755 --- a/launcher_scripts/conf/training/gpt3/40b.yaml +++ b/launcher_scripts/conf/training/gpt3/40b.yaml @@ -117,10 +117,13 @@ model: megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters grad_allreduce_chunk_size_mb: 125 + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # To use fp8, please set `transformer_engine=True` and `fp8=True`. # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: True + transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml index 62b3f4e3f0..57c39b899a 100644 --- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml @@ -125,6 +125,9 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml index f42e472bca..76a79d85d1 100755 --- a/launcher_scripts/conf/training/gpt3/5b.yaml +++ b/launcher_scripts/conf/training/gpt3/5b.yaml @@ -117,10 +117,13 @@ model: megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters grad_allreduce_chunk_size_mb: 125 + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # To use fp8, please set `transformer_engine=True` and `fp8=True`. 
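The fp8 knobs referenced in these hunks map onto Transformer Engine's delayed-scaling recipe (the inline comments above already note the recipe.Format.E4M3 / HYBRID correspondence). A standalone sketch of roughly what gets constructed from them when fp8 is enabled; this is not the launcher's or NeMo's actual plumbing, and the history length here is only an illustrative value:

    import torch
    import transformer_engine.pytorch as te
    from transformer_engine.common import recipe

    # Requires an fp8-capable GPU (Hopper/Ada); values mirror the config knobs above.
    fp8_recipe = recipe.DelayedScaling(
        fp8_format=recipe.Format.HYBRID,   # fp8_hybrid: True
        amax_history_len=1024,             # illustrative; exposed as fp8_amax_history_len
        amax_compute_algo="max",           # fp8_amax_compute_algo: max
    )
    layer = te.Linear(1024, 1024).cuda()
    x = torch.randn(16, 1024, device="cuda")
    with te.fp8_autocast(enabled=True, fp8_recipe=fp8_recipe):
        y = layer(x)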
# The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: True + transformer_engine: False fp8: False # enables fp8 in TransformerLayer forward fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml index 7a2d23ad03..ecfba32d93 100644 --- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml @@ -125,6 +125,9 @@ model: batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1 num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + ## Using Megatron Core + mcore_gpt: True + ## Transformer Engine # fp8 training is currently not supported in the improved models transformer_engine: False From fc7575e17dbbf3d5579a69260940be2264a664b6 Mon Sep 17 00:00:00 2001 From: David Date: Mon, 11 Sep 2023 22:02:24 -0700 Subject: [PATCH 52/62] updatiing GPT conversion (#125) Signed-off-by: David Mosallanezhad --- launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml b/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml index dc8d43b44d..2b80dd31aa 100755 --- a/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml +++ b/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml @@ -13,7 +13,7 @@ run: model: model_type: gpt # gpt or t5, use t5 for mt5 as well checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints - checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) + checkpoint_name: megatron_gpt-*last # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) hparams_file: ${conversion.run.train_dir}/results/hparams.yaml tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 From 68ae39f9a65a01c2655f26e5d879a9333c795403 Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 12 Sep 2023 12:03:46 -0700 Subject: [PATCH 53/62] Set ub_tp_overlap to False, change 175b_fp8 to use TP=4, MBS=2 Signed-off-by: Sangkug Lym --- launcher_scripts/conf/training/gpt3/175b.yaml | 2 +- .../conf/training/gpt3/175b_fp8.yaml | 246 ++++++++++++++++++ launcher_scripts/conf/training/gpt3/20b.yaml | 2 +- launcher_scripts/conf/training/gpt3/40b.yaml | 2 +- 4 files changed, 249 insertions(+), 3 deletions(-) create mode 100755 launcher_scripts/conf/training/gpt3/175b_fp8.yaml diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index df8049ba77..62064a4d84 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -133,7 +133,7 @@ model: fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history fp8_wgrad: True use_emha: False - ub_tp_comm_overlap: True + ub_tp_comm_overlap: False # miscellaneous seed: 1234 diff --git a/launcher_scripts/conf/training/gpt3/175b_fp8.yaml b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml new file mode 100755 index 0000000000..4bea128403 --- /dev/null +++ b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml @@ -0,0 +1,246 @@ +# The configurations below provide the best 175B training performance with the NeMo SW stack. +# We have confirmed the model convergence only with a limited number of tokens and the full model +# convergence (e.g., 300B tokens) is not guaranteed. +hydra: + searchpath: + - file:///opt/NeMo/examples/nlp/language_modeling/conf + +run: + name: gpt3_175b + results_dir: ${base_results_dir}/${.name} + time_limit: "26-00:00:00" + dependency: "singleton" + +trainer: + num_nodes: 128 + devices: 8 + accelerator: gpu + precision: bf16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: null + max_steps: 75000 # consumed_samples = global_step * global_batch_size + max_time: "25:23:00:00" + log_every_n_steps: 10 + val_check_interval: 2000 + limit_val_batches: 20 + limit_test_batches: 20 + accumulate_grad_batches: 1 + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: ${training.run.results_dir}/results + exp_dir: null + name: megatron_gpt + create_wandb_logger: False + wandb_logger_kwargs: + project: nemo_gpt3 + name: ${training.run.name} + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: val_loss + save_top_k: 5 + mode: min + always_save_nemo: False # saves nemo file during validation, not implemented for model parallel + save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits + filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' + model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} + log_step_timing: True + step_timing_kwargs: + sync_cuda: True + buffer_size: 5 + +model: + micro_batch_size: 2 + global_batch_size: 2048 + tensor_model_parallel_size: 8 + pipeline_model_parallel_size: 8 + virtual_pipeline_model_parallel_size: 12 # interleaved pipeline, set to maximum + resume_from_checkpoint: null # manually set the checkpoint file to load from + # model architecture + encoder_seq_length: 2048 + max_position_embeddings: 2048 + num_layers: 96 + hidden_size: 12288 + ffn_hidden_size: ${multiply:4, ${.hidden_size}} # Transformer FFN hidden size. 4 * hidden_size. + num_attention_heads: 96 + init_method_std: 0.006 # Standard deviation of the zero mean normal distribution used for weight initialization.') + hidden_dropout: 0.1 # Dropout probability for hidden state transformer. + attention_dropout: 0.1 + kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null + apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. + layernorm_epsilon: 1e-5 + make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. + pre_process: True # add embedding + post_process: True # add pooler + persist_layer_norm: True # Use of persistent fused layer norm kernel. 
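The parallelism arithmetic behind the new 175b_fp8 settings is easy to check: 128 nodes × 8 GPUs with TP=8 and PP=8 leave 16 data-parallel replicas, so a global batch of 2048 at micro-batch 2 implies 64 accumulated micro-batches per step. A minimal sketch of that check (illustrative only, not part of the launcher; the helper name is made up):

```python
# Illustrative check of the 175b_fp8 parallelism settings (not part of the launcher).
def batch_arithmetic(num_nodes=128, devices=8, tp=8, pp=8,
                     micro_batch_size=2, global_batch_size=2048):
    world_size = num_nodes * devices              # 1024 GPUs
    model_parallel = tp * pp                      # 64 GPUs per model replica
    data_parallel = world_size // model_parallel  # 16 replicas
    # The global batch must divide evenly into micro_batch_size * data_parallel.
    assert global_batch_size % (micro_batch_size * data_parallel) == 0
    grad_accumulation = global_batch_size // (micro_batch_size * data_parallel)
    return data_parallel, grad_accumulation

print(batch_arithmetic())  # -> (16, 64)
```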
+ gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) + + # Fusion + grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce + gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs + bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. + bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. + masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. + + ## Activation Checkpointing + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block' + activations_checkpoint_num_layers: null + num_micro_batches_with_partial_activation_checkpoints: null + activations_checkpoint_layers_per_pipeline: null + + ## Sequence Parallelism + sequence_parallel: True + + num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. + + tokenizer: + library: 'megatron' + type: 'GPT2BPETokenizer' + model: null + delimiter: null # only used for tabular tokenizer + vocab_file: ${data_dir}/bpe/vocab.json + merge_file: ${data_dir}/bpe/merges.txt + + # precision + native_amp_init_scale: 4294967296 # 2 ** 32 + native_amp_growth_interval: 1000 + hysteresis: 2 # Gradient scale hysteresis + fp32_residual_connection: False # Move residual connections to fp32 + fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 + + # Megatron O2-style half-precision + megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters + grad_allreduce_chunk_size_mb: 125 + + ## Using Megatron Core + mcore_gpt: True + + ## Transformer Engine + # To use fp8, please set `transformer_engine=True` and `fp8=True`. + # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training + transformer_engine: False + fp8: True # enables fp8 in TransformerLayer forward + fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 + fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID + fp8_margin: 0 # scaling margin + fp8_interval: 1 # scaling update interval + fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor + fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history + fp8_wgrad: True + use_emha: False + ub_tp_comm_overlap: False + + # miscellaneous + seed: 1234 + sync_batch_comm: False + use_cpu_initialization: False # Init weights on the CPU (slow for large models) + onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. 
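For orientation, the fp8_* knobs above correspond roughly to a TransformerEngine delayed-scaling recipe; NeMo builds the equivalent object internally, so the snippet below is only a sketch of what each knob controls and assumes the 0.x-era `transformer_engine.common.recipe.DelayedScaling` API (the exact signature may differ between releases):

```python
# Sketch only: rough mapping of the fp8_* knobs onto a TransformerEngine
# delayed-scaling recipe (assumes TE 0.x; NeMo constructs this internally).
from transformer_engine.common import recipe

fp8_recipe = recipe.DelayedScaling(
    margin=0,                         # fp8_margin
    interval=1,                       # fp8_interval
    fp8_format=recipe.Format.HYBRID,  # fp8_hybrid: True (E4M3 fwd, E5M2 bwd)
    amax_history_len=1024,            # fp8_amax_history_len
    amax_compute_algo="max",          # fp8_amax_compute_algo
)
# Note: the recipe only takes effect when both transformer_engine and fp8 are True.
```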
+ apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this + overlap_p2p_comm: True # Overlap p2p communication with computes + batch_p2p_comm: False # Batch consecutive inter-peer send/recv operations + + # Nsys profiling options + nsys_profile: + enabled: False + trace: [nvtx,cuda] + start_step: 10 # Global batch to start profiling + end_step: 10 # Global batch to end profiling + ranks: [0] # Global rank IDs to profile + gen_shape: False # Generate model and kernel details including input shapes + + optim: + name: distributed_fused_adam + bucket_cap_mb: 100 + overlap_grad_sync: True + overlap_param_sync: true + contiguous_grad_buffer: True + grad_sync_dtype: bf16 + lr: 0.9e-4 + weight_decay: 0.1 + betas: + - 0.9 + - 0.95 + sched: + name: CosineAnnealing + warmup_steps: 115 + constant_steps: 12500 + min_lr: 0.9e-5 + + data: + data_impl: mmap + splits_string: "99990,8,2" + seq_length: 2048 + skip_warmup: True + num_workers: 2 + dataloader_type: single # cyclic + reset_position_ids: False # Reset position ids after end-of-document token + reset_attention_mask: False # Reset attention mask after end-of-document token + eod_mask_loss: False # Mask loss for the end of document tokens + index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix + data_prefix: # Should be weight path weight path... for a blended dataset + - .0333 + - ${data_dir}/my-gpt3_00_text_document + - .0333 + - ${data_dir}/my-gpt3_01_text_document + - .0333 + - ${data_dir}/my-gpt3_02_text_document + - .0333 + - ${data_dir}/my-gpt3_03_text_document + - .0333 + - ${data_dir}/my-gpt3_04_text_document + - .0333 + - ${data_dir}/my-gpt3_05_text_document + - .0333 + - ${data_dir}/my-gpt3_06_text_document + - .0333 + - ${data_dir}/my-gpt3_07_text_document + - .0333 + - ${data_dir}/my-gpt3_08_text_document + - .0333 + - ${data_dir}/my-gpt3_09_text_document + - .0333 + - ${data_dir}/my-gpt3_10_text_document + - .0333 + - ${data_dir}/my-gpt3_11_text_document + - .0333 + - ${data_dir}/my-gpt3_12_text_document + - .0333 + - ${data_dir}/my-gpt3_13_text_document + - .0333 + - ${data_dir}/my-gpt3_14_text_document + - .0333 + - ${data_dir}/my-gpt3_15_text_document + - .0333 + - ${data_dir}/my-gpt3_16_text_document + - .0333 + - ${data_dir}/my-gpt3_17_text_document + - .0333 + - ${data_dir}/my-gpt3_18_text_document + - .0333 + - ${data_dir}/my-gpt3_19_text_document + - .0333 + - ${data_dir}/my-gpt3_20_text_document + - .0333 + - ${data_dir}/my-gpt3_21_text_document + - .0333 + - ${data_dir}/my-gpt3_22_text_document + - .0333 + - ${data_dir}/my-gpt3_23_text_document + - .0333 + - ${data_dir}/my-gpt3_24_text_document + - .0333 + - ${data_dir}/my-gpt3_25_text_document + - .0333 + - ${data_dir}/my-gpt3_26_text_document + - .0333 + - ${data_dir}/my-gpt3_27_text_document + - .0333 + - ${data_dir}/my-gpt3_28_text_document + - .0334 + - ${data_dir}/my-gpt3_29_text_document + diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml index d54a2dc0ed..092c52f9ae 100755 --- a/launcher_scripts/conf/training/gpt3/20b.yaml +++ b/launcher_scripts/conf/training/gpt3/20b.yaml @@ -133,7 +133,7 @@ model: fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history fp8_wgrad: True use_emha: False - ub_tp_comm_overlap: True + ub_tp_comm_overlap: False # miscellaneous seed: 1234 diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml index 8a4a562d5a..d4dfa3e928 100755 --- a/launcher_scripts/conf/training/gpt3/40b.yaml +++ b/launcher_scripts/conf/training/gpt3/40b.yaml @@ -133,7 +133,7 @@ model: fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True use_emha: False - ub_tp_comm_overlap: True + ub_tp_comm_overlap: False # miscellaneous seed: 1234 From 8f2aceddd74d7871a30a442e44dacc077419e4fe Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 12 Sep 2023 12:15:05 -0700 Subject: [PATCH 54/62] set NCCL_NVLS_ENABLE=0 for memory saving Signed-off-by: Sangkug Lym --- launcher_scripts/conf/config.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 30d553059a..80267d5a56 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -52,6 +52,7 @@ env_vars: NCCL_PROTO: null # Protocol NCCL will use. Set to "simple" for AWS TRANSFORMERS_OFFLINE: 1 TORCH_NCCL_AVOID_RECORD_STREAMS: 1 + NCCL_NVLS_ENABLE: 0 # GPU Mapping numa_mapping: From 4dbf28730cc1ef031f0af14190829afebc88c497 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Tue, 12 Sep 2023 17:43:37 -0700 Subject: [PATCH 55/62] support distributed checkpointing in checkpoint search Signed-off-by: Maanu Grover --- .../collections/checkpoint_search.py | 26 ++++++++++++++----- 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/launcher_scripts/nemo_launcher/collections/checkpoint_search.py b/launcher_scripts/nemo_launcher/collections/checkpoint_search.py index 4da89ed860..6bdd7d5101 100755 --- a/launcher_scripts/nemo_launcher/collections/checkpoint_search.py +++ b/launcher_scripts/nemo_launcher/collections/checkpoint_search.py @@ -51,16 +51,30 @@ def checkpoint_search(cfg): pipeline_model_parallel_size = cfg.pipeline_model_parallel_size if checkpoint_name == "latest": - checkpoints = os.path.join(checkpoint_folder, "*.ckpt") - checkpoints = _inject_model_parallel_rank( - checkpoints, tensor_model_parallel_size, pipeline_model_parallel_size - ) - checkpoint_list = glob.glob(checkpoints) + + dist_ckpt = False + # Every distributed checkpoint saves a 'common.pt' file + for result in glob.glob(os.path.join(checkpoint_folder, "*")): + if os.path.exists(os.path.join(result, 'common.pt')): + dist_ckpt = True + break + + if dist_ckpt: + checkpoint_list = [f for f in glob.glob(os.path.join(checkpoint_folder, "*")) if os.path.isdir(f)] + else: + checkpoints = os.path.join(checkpoint_folder, "*.ckpt") + + checkpoints = _inject_model_parallel_rank( + checkpoints, tensor_model_parallel_size, pipeline_model_parallel_size + ) + checkpoint_list = glob.glob(checkpoints) + latest_checkpoint = max(checkpoint_list, key=os.path.getctime) checkpoint_name = os.path.basename(latest_checkpoint) checkpoint = os.path.join(checkpoint_folder, checkpoint_name) - checkpoint = _inject_model_parallel_rank(checkpoint, tensor_model_parallel_size, pipeline_model_parallel_size) + if not dist_ckpt: + checkpoint = _inject_model_parallel_rank(checkpoint, tensor_model_parallel_size, pipeline_model_parallel_size) checkpoint_list = glob.glob(checkpoint) if len(checkpoint_list) > 1: raise ValueError("Too many checkpoints fit the checkpoint name pattern in conversion config.") From 
10e03e042e65995e00c75488d5327c644fff838f Mon Sep 17 00:00:00 2001 From: Sangkug Lym Date: Tue, 12 Sep 2023 18:20:19 -0700 Subject: [PATCH 56/62] clearn up Signed-off-by: Sangkug Lym --- launcher_scripts/conf/training/gpt3/126m.yaml | 1 - launcher_scripts/conf/training/gpt3/175b.yaml | 1 - .../conf/training/gpt3/175b_fp8.yaml | 1 - .../conf/training/gpt3/175b_performance.yaml | 247 ------------------ .../conf/training/gpt3/1b_improved.yaml | 1 - launcher_scripts/conf/training/gpt3/20b.yaml | 1 - .../conf/training/gpt3/400m_improved.yaml | 1 - launcher_scripts/conf/training/gpt3/40b.yaml | 1 - .../conf/training/gpt3/40b_improved.yaml | 1 - launcher_scripts/conf/training/gpt3/5b.yaml | 1 - .../conf/training/gpt3/7b_improved.yaml | 1 - ...b_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml | 53 ---- ...b_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml | 53 ---- ...b_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml | 59 ----- ...b_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml | 59 ----- 15 files changed, 481 deletions(-) delete mode 100755 launcher_scripts/conf/training/gpt3/175b_performance.yaml delete mode 100644 launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml delete mode 100644 launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml delete mode 100644 launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml delete mode 100644 launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml diff --git a/launcher_scripts/conf/training/gpt3/126m.yaml b/launcher_scripts/conf/training/gpt3/126m.yaml index bd719118af..27d3329756 100755 --- a/launcher_scripts/conf/training/gpt3/126m.yaml +++ b/launcher_scripts/conf/training/gpt3/126m.yaml @@ -133,7 +133,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False # miscellaneous diff --git a/launcher_scripts/conf/training/gpt3/175b.yaml b/launcher_scripts/conf/training/gpt3/175b.yaml index 62064a4d84..33125cbb9f 100755 --- a/launcher_scripts/conf/training/gpt3/175b.yaml +++ b/launcher_scripts/conf/training/gpt3/175b.yaml @@ -132,7 +132,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False # miscellaneous diff --git a/launcher_scripts/conf/training/gpt3/175b_fp8.yaml b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml index 4bea128403..f58e86cbd9 100755 --- a/launcher_scripts/conf/training/gpt3/175b_fp8.yaml +++ b/launcher_scripts/conf/training/gpt3/175b_fp8.yaml @@ -132,7 +132,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False # miscellaneous diff --git a/launcher_scripts/conf/training/gpt3/175b_performance.yaml b/launcher_scripts/conf/training/gpt3/175b_performance.yaml deleted file mode 100755 index 7c0d55b37f..0000000000 --- a/launcher_scripts/conf/training/gpt3/175b_performance.yaml +++ /dev/null @@ -1,247 +0,0 @@ -# The configurations below provide the best 175B training performance with the NeMo SW stack. 
-# We have confirmed the model convergence only with a limited number of tokens and the full model -# convergence (e.g., 300B tokens) is not guaranteed. -hydra: - searchpath: - - file:///opt/NeMo/examples/nlp/language_modeling/conf - -run: - name: gpt3_175b - results_dir: ${base_results_dir}/${.name} - time_limit: "26-00:00:00" - dependency: "singleton" - -trainer: - num_nodes: 128 - devices: 8 - accelerator: gpu - precision: bf16 - logger: False # logger provided by exp_manager - enable_checkpointing: False - use_distributed_sampler: False - max_epochs: null - max_steps: 75000 # consumed_samples = global_step * global_batch_size - max_time: "25:23:00:00" - log_every_n_steps: 10 - val_check_interval: 2000 - limit_val_batches: 20 - limit_test_batches: 20 - accumulate_grad_batches: 1 - gradient_clip_val: 1.0 - -exp_manager: - explicit_log_dir: ${training.run.results_dir}/results - exp_dir: null - name: megatron_gpt - create_wandb_logger: False - wandb_logger_kwargs: - project: nemo_gpt3 - name: ${training.run.name} - resume_if_exists: True - resume_ignore_no_checkpoint: True - create_checkpoint_callback: True - checkpoint_callback_params: - monitor: val_loss - save_top_k: 5 - mode: min - always_save_nemo: False # saves nemo file during validation, not implemented for model parallel - save_nemo_on_train_end: False # not recommended when training large models on clusters with short time limits - filename: 'megatron_gpt--{val_loss:.2f}-{step}-{consumed_samples}' - model_parallel_size: ${multiply:${training.model.tensor_model_parallel_size}, ${training.model.pipeline_model_parallel_size}} - log_step_timing: True - step_timing_kwargs: - sync_cuda: True - buffer_size: 5 - -model: - micro_batch_size: 1 - global_batch_size: 2048 - tensor_model_parallel_size: 4 - pipeline_model_parallel_size: 8 - virtual_pipeline_model_parallel_size: 12 # interleaved pipeline, set to maximum - resume_from_checkpoint: null # manually set the checkpoint file to load from - # model architecture - encoder_seq_length: 2048 - max_position_embeddings: 2048 - num_layers: 96 - hidden_size: 12288 - ffn_hidden_size: ${multiply:4, ${.hidden_size}} # Transformer FFN hidden size. 4 * hidden_size. - num_attention_heads: 96 - init_method_std: 0.006 # Standard deviation of the zero mean normal distribution used for weight initialization.') - hidden_dropout: 0.1 # Dropout probability for hidden state transformer. - attention_dropout: 0.1 - kv_channels: null # Projection weights dimension in multi-head attention. Set to hidden_size // num_attention_heads if null - apply_query_key_layer_scaling: True # scale Q * K^T by 1 / layer-number. - layernorm_epsilon: 1e-5 - make_vocab_size_divisible_by: 128 # Pad the vocab size to be divisible by this value for computation efficiency. - pre_process: True # add embedding - post_process: True # add pooler - persist_layer_norm: True # Use of persistent fused layer norm kernel. - gradient_as_bucket_view: True # Allocate gradients in a contiguous bucket to save memory (less fragmentation and buffer memory) - - # Fusion - grad_div_ar_fusion: True # Fuse grad division into torch.distributed.all_reduce - gradient_accumulation_fusion: True # Fuse weight gradient accumulation to GEMMs - bias_activation_fusion: True # Use a kernel that fuses the bias addition from weight matrices with the subsequent activation function. - bias_dropout_add_fusion: True # Use a kernel that fuses the bias addition, dropout and residual connection addition. 
- masked_softmax_fusion: True # Use a kernel that fuses the attention softmax with it's mask. - - ## Activation Checkpointing - activations_checkpoint_granularity: null # 'selective' or 'full' - activations_checkpoint_method: null # 'uniform', 'block' - activations_checkpoint_num_layers: null - num_micro_batches_with_partial_activation_checkpoints: null - activations_checkpoint_layers_per_pipeline: null - - ## Sequence Parallelism - sequence_parallel: True - - num_query_groups: null # Number of query groups for group query attention. If None, normal attention is used. - - tokenizer: - library: 'megatron' - type: 'GPT2BPETokenizer' - model: null - delimiter: null # only used for tabular tokenizer - vocab_file: ${data_dir}/bpe/vocab.json - merge_file: ${data_dir}/bpe/merges.txt - - # precision - native_amp_init_scale: 4294967296 # 2 ** 32 - native_amp_growth_interval: 1000 - hysteresis: 2 # Gradient scale hysteresis - fp32_residual_connection: False # Move residual connections to fp32 - fp16_lm_cross_entropy: False # Move the cross entropy unreduced loss calculation for lm head to fp16 - - # Megatron O2-style half-precision - megatron_amp_O2: True # Enable O2-level automatic mixed precision using master parameters - grad_allreduce_chunk_size_mb: 125 - - ## Using Megatron Core - mcore_gpt: True - - ## Transformer Engine - # To use fp8, please set `transformer_engine=True` and `fp8=True`. - # The rest of fp8 knobs are set for the fp8 training mode, which are ignored in non-fp8 training - transformer_engine: False - fp8: False # enables fp8 in TransformerLayer forward - fp8_e4m3: False # sets fp8_format = recipe.Format.E4M3 - fp8_hybrid: True # sets fp8_format = recipe.Format.HYBRID - fp8_margin: 0 # scaling margin - fp8_interval: 1 # scaling update interval - fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor - fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history - fp8_wgrad: True - use_emha: False - ub_tp_comm_overlap: True - - # miscellaneous - seed: 1234 - sync_batch_comm: False - use_cpu_initialization: False # Init weights on the CPU (slow for large models) - onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter. 
- apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this - overlap_p2p_comm: True # Overlap p2p communication with computes - batch_p2p_comm: False # Batch consecutive inter-peer send/recv operations - gc_interval: 100 # Interval of the host memory garbage collection - - # Nsys profiling options - nsys_profile: - enabled: False - trace: [nvtx,cuda] - start_step: 10 # Global batch to start profiling - end_step: 10 # Global batch to end profiling - ranks: [0] # Global rank IDs to profile - gen_shape: False # Generate model and kernel details including input shapes - - optim: - name: distributed_fused_adam - bucket_cap_mb: 100 - overlap_grad_sync: True - overlap_param_sync: true - contiguous_grad_buffer: True - grad_sync_dtype: bf16 - lr: 0.9e-4 - weight_decay: 0.1 - betas: - - 0.9 - - 0.95 - sched: - name: CosineAnnealing - warmup_steps: 115 - constant_steps: 12500 - min_lr: 0.9e-5 - - data: - data_impl: mmap - splits_string: "99990,8,2" - seq_length: 2048 - skip_warmup: True - num_workers: 2 - dataloader_type: single # cyclic - reset_position_ids: False # Reset position ids after end-of-document token - reset_attention_mask: False # Reset attention mask after end-of-document token - eod_mask_loss: False # Mask loss for the end of document tokens - index_mapping_dir: null # path to save index mapping .npy files, by default will save in the same location as data_prefix - data_prefix: # Should be weight path weight path... for a blended dataset - - .0333 - - ${data_dir}/my-gpt3_00_text_document - - .0333 - - ${data_dir}/my-gpt3_01_text_document - - .0333 - - ${data_dir}/my-gpt3_02_text_document - - .0333 - - ${data_dir}/my-gpt3_03_text_document - - .0333 - - ${data_dir}/my-gpt3_04_text_document - - .0333 - - ${data_dir}/my-gpt3_05_text_document - - .0333 - - ${data_dir}/my-gpt3_06_text_document - - .0333 - - ${data_dir}/my-gpt3_07_text_document - - .0333 - - ${data_dir}/my-gpt3_08_text_document - - .0333 - - ${data_dir}/my-gpt3_09_text_document - - .0333 - - ${data_dir}/my-gpt3_10_text_document - - .0333 - - ${data_dir}/my-gpt3_11_text_document - - .0333 - - ${data_dir}/my-gpt3_12_text_document - - .0333 - - ${data_dir}/my-gpt3_13_text_document - - .0333 - - ${data_dir}/my-gpt3_14_text_document - - .0333 - - ${data_dir}/my-gpt3_15_text_document - - .0333 - - ${data_dir}/my-gpt3_16_text_document - - .0333 - - ${data_dir}/my-gpt3_17_text_document - - .0333 - - ${data_dir}/my-gpt3_18_text_document - - .0333 - - ${data_dir}/my-gpt3_19_text_document - - .0333 - - ${data_dir}/my-gpt3_20_text_document - - .0333 - - ${data_dir}/my-gpt3_21_text_document - - .0333 - - ${data_dir}/my-gpt3_22_text_document - - .0333 - - ${data_dir}/my-gpt3_23_text_document - - .0333 - - ${data_dir}/my-gpt3_24_text_document - - .0333 - - ${data_dir}/my-gpt3_25_text_document - - .0333 - - ${data_dir}/my-gpt3_26_text_document - - .0333 - - ${data_dir}/my-gpt3_27_text_document - - .0333 - - ${data_dir}/my-gpt3_28_text_document - - .0334 - - ${data_dir}/my-gpt3_29_text_document - diff --git a/launcher_scripts/conf/training/gpt3/1b_improved.yaml b/launcher_scripts/conf/training/gpt3/1b_improved.yaml index 173497fc55..0b0c73421f 100644 --- a/launcher_scripts/conf/training/gpt3/1b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/1b_improved.yaml @@ -139,7 +139,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False optim: diff --git a/launcher_scripts/conf/training/gpt3/20b.yaml b/launcher_scripts/conf/training/gpt3/20b.yaml index 092c52f9ae..f748e32bd4 100755 --- a/launcher_scripts/conf/training/gpt3/20b.yaml +++ b/launcher_scripts/conf/training/gpt3/20b.yaml @@ -132,7 +132,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False # miscellaneous diff --git a/launcher_scripts/conf/training/gpt3/400m_improved.yaml b/launcher_scripts/conf/training/gpt3/400m_improved.yaml index 6385fa7526..f6cdbdea66 100644 --- a/launcher_scripts/conf/training/gpt3/400m_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/400m_improved.yaml @@ -139,7 +139,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False optim: diff --git a/launcher_scripts/conf/training/gpt3/40b.yaml b/launcher_scripts/conf/training/gpt3/40b.yaml index d4dfa3e928..e689fe7d4e 100755 --- a/launcher_scripts/conf/training/gpt3/40b.yaml +++ b/launcher_scripts/conf/training/gpt3/40b.yaml @@ -132,7 +132,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False # miscellaneous diff --git a/launcher_scripts/conf/training/gpt3/40b_improved.yaml b/launcher_scripts/conf/training/gpt3/40b_improved.yaml index 57c39b899a..077d3cb5ee 100644 --- a/launcher_scripts/conf/training/gpt3/40b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/40b_improved.yaml @@ -139,7 +139,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False optim: diff --git a/launcher_scripts/conf/training/gpt3/5b.yaml b/launcher_scripts/conf/training/gpt3/5b.yaml index 76a79d85d1..388c052121 100755 --- a/launcher_scripts/conf/training/gpt3/5b.yaml +++ b/launcher_scripts/conf/training/gpt3/5b.yaml @@ -132,7 +132,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False # miscellaneous diff --git a/launcher_scripts/conf/training/gpt3/7b_improved.yaml b/launcher_scripts/conf/training/gpt3/7b_improved.yaml index ecfba32d93..9c3258b195 100644 --- a/launcher_scripts/conf/training/gpt3/7b_improved.yaml +++ b/launcher_scripts/conf/training/gpt3/7b_improved.yaml @@ -139,7 +139,6 @@ model: fp8_amax_history_len: 1024 # Number of steps for which amax history is recorded per tensor fp8_amax_compute_algo: max # 'most_recent' or 'max'. 
Algorithm for computing amax from history fp8_wgrad: True - use_emha: False ub_tp_comm_overlap: False optim: diff --git a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml b/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml deleted file mode 100644 index 33bbffb7ce..0000000000 --- a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs1_seqlen2048.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# UB communicator configurations -# Model configs: A100/175B/TP4/MBS1/SeqLen2K/BF16 - -# Bulk overlap with AllGather -qkv_dgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -qkv_wgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -fc1_dgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -fc1_wgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -## Ring-exchange overlap with AllGather -qkv_fprop: - method: ring_exchange - aggregate: 0 - -proj_dgrad: - method: ring_exchange - aggregate: 0 - -fc1_fprop: - method: ring_exchange - aggregate: 0 - -fc2_dgrad: - method: ring_exchange - aggregate: 0 - -# Chunked-collective overlap with ReduceScatter -proj_fprop: - method: pipeline - num_sm: 4 - num_splits: 4 - set_sm_margin: 0 - -fc2_fprop: - method: pipeline - num_sm: 4 - num_splits: 4 - set_sm_margin: 0 \ No newline at end of file diff --git a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml b/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml deleted file mode 100644 index 434e0a29f4..0000000000 --- a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_a100_h12288_tp4_mbs2_seqlen2048.yaml +++ /dev/null @@ -1,53 +0,0 @@ -# UB communicator configurations -# Model configs: A100/175B/TP4/MBS2/SeqLen2K/BF16 - -# Bulk overlap with AllGather -qkv_dgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -qkv_wgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -fc1_dgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -fc1_wgrad: - method: bulk - num_sm: 2 - set_sm_margin: 0 - -## Ring-exchange overlap with AllGather -qkv_fprop: - method: ring_exchange - aggregate: 0 - -proj_dgrad: - method: ring_exchange - aggregate: 0 - -fc1_fprop: - method: ring_exchange - aggregate: 0 - -fc2_dgrad: - method: ring_exchange - aggregate: 0 - -# Chunked-collective overlap with ReduceScatter -proj_fprop: - method: pipeline - num_sm: 8 - num_splits: 4 - set_sm_margin: 0 - -fc2_fprop: - method: pipeline - num_sm: 4 - num_splits: 4 - set_sm_margin: 0 diff --git a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml b/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml deleted file mode 100644 index 21d02f3dd2..0000000000 --- a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp4_mbs1_seqlen2048.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# UB communicator configurations -# Model configs: H100/175B/TP4/MBS1/SeqLen2K/FP8 - -# Bulk overlap with AllGather / ReduceScatter -qkv_dgrad: - method: bulk - num_sm: 4 - cga_size: 2 - set_sm_margin: 0 - -qkv_wgrad: - method: bulk - num_sm: 8 - cga_size: 2 - set_sm_margin: 0 - -fc1_dgrad: - method: bulk - num_sm: 2 - cga_size: 2 - set_sm_margin: 0 - -fc1_wgrad: - method: bulk - num_sm: 4 - cga_size: 2 - set_sm_margin: 0 - -## Ring-exchange overlap with AllGather -qkv_fprop: - method: ring_exchange - aggregate: 0 - -proj_dgrad: - method: ring_exchange - aggregate: 0 - -fc1_fprop: - method: ring_exchange - aggregate: 0 - -fc2_dgrad: - method: 
ring_exchange - aggregate: 1 - -# Chunked-collective overlap with ReduceScatter -proj_fprop: - method: pipeline - num_sm: 24 - cga_size: 2 - num_splits: 4 - set_sm_margin: 1 - -fc2_fprop: - method: pipeline - num_sm: 20 - cga_size: 2 - num_splits: 4 - set_sm_margin: 1 diff --git a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml b/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml deleted file mode 100644 index 444c8245e0..0000000000 --- a/launcher_scripts/conf/training/gpt3/ub-confs/ub_cfg_h100_h12288_tp8_mbs2_seqlen2048.yaml +++ /dev/null @@ -1,59 +0,0 @@ -# UB communicator configurations -# Model configs: H100/175B/TP8/MBS2/SeqLen2K/FP8 - -# Bulk overlap with AllGather -qkv_dgrad: - method: bulk - num_sm: 8 - cga_size: 2 - set_sm_margin: 0 - -qkv_wgrad: - method: bulk - num_sm: 16 - cga_size: 2 - set_sm_margin: 0 - -fc1_dgrad: - method: bulk - num_sm: 4 - cga_size: 2 - set_sm_margin: 0 - -fc1_wgrad: - method: bulk - num_sm: 16 - cga_size: 2 - set_sm_margin: 0 - -## Ring-exchange overlap with AllGather -qkv_fprop: - method: ring_exchange - aggregate: 0 - -proj_dgrad: - method: ring_exchange - aggregate: 1 - -fc1_fprop: - method: ring_exchange - aggregate: 0 - -fc2_dgrad: - method: ring_exchange - aggregate: 0 - -# Chunked-collective overlap with ReduceScatter -proj_fprop: - method: pipeline - num_sm: 16 - cga_size: 2 - num_splits: 4 - set_sm_margin: 1 - -fc2_fprop: - method: pipeline - num_sm: 24 - cga_size: 2 - num_splits: 4 - set_sm_margin: 1 From 093bafc66df3424b148c7698f40cf1f074344ee5 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 13 Sep 2023 09:19:58 -0700 Subject: [PATCH 57/62] move dist ckpt flag --- .../nemo_launcher/collections/checkpoint_search.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/launcher_scripts/nemo_launcher/collections/checkpoint_search.py b/launcher_scripts/nemo_launcher/collections/checkpoint_search.py index 6bdd7d5101..5cd080b96f 100755 --- a/launcher_scripts/nemo_launcher/collections/checkpoint_search.py +++ b/launcher_scripts/nemo_launcher/collections/checkpoint_search.py @@ -50,14 +50,14 @@ def checkpoint_search(cfg): tensor_model_parallel_size = cfg.tensor_model_parallel_size pipeline_model_parallel_size = cfg.pipeline_model_parallel_size - if checkpoint_name == "latest": + dist_ckpt = False + # Every distributed checkpoint saves a 'common.pt' file + for result in glob.glob(os.path.join(checkpoint_folder, "*")): + if os.path.exists(os.path.join(result, 'common.pt')): + dist_ckpt = True + break - dist_ckpt = False - # Every distributed checkpoint saves a 'common.pt' file - for result in glob.glob(os.path.join(checkpoint_folder, "*")): - if os.path.exists(os.path.join(result, 'common.pt')): - dist_ckpt = True - break + if checkpoint_name == "latest": if dist_ckpt: checkpoint_list = [f for f in glob.glob(os.path.join(checkpoint_folder, "*")) if os.path.isdir(f)] From e21358330ea14ed042c6486674357f5769de5355 Mon Sep 17 00:00:00 2001 From: Maanu Grover Date: Wed, 13 Sep 2023 09:24:38 -0700 Subject: [PATCH 58/62] update config --- launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml b/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml index 2b80dd31aa..dc8d43b44d 100755 --- a/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml +++ b/launcher_scripts/conf/conversion/gpt3/convert_gpt3.yaml @@ -13,7 +13,7 @@ run: 
model: model_type: gpt # gpt or t5, use t5 for mt5 as well checkpoint_folder: ${conversion.run.train_dir}/results/checkpoints - checkpoint_name: megatron_gpt-*last # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) + checkpoint_name: latest # latest OR name pattern of a checkpoint (e.g. megatron_gpt-*last.ckpt) hparams_file: ${conversion.run.train_dir}/results/hparams.yaml tensor_model_parallel_size: 1 pipeline_model_parallel_size: 1 From 4e7d80b102144f57d0023336966b14c411100dd7 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Wed, 13 Sep 2023 14:20:55 -0700 Subject: [PATCH 59/62] Create peft t5 squad.yaml --- launcher_scripts/conf/peft/t5/squad.yaml | 230 +++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 launcher_scripts/conf/peft/t5/squad.yaml diff --git a/launcher_scripts/conf/peft/t5/squad.yaml b/launcher_scripts/conf/peft/t5/squad.yaml new file mode 100644 index 0000000000..cdd452bab3 --- /dev/null +++ b/launcher_scripts/conf/peft/t5/squad.yaml @@ -0,0 +1,230 @@ +name: megatron_t5_peft_tuning-${peft.model.peft.peft_scheme} + +run: + name: ${.task_name}_${.model_train_name} + time_limit: "04:00:00" + dependency: "singleton" + convert_name: convert_nemo + model_train_name: t5 + convert_dir: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name} + task_name: "squad" + results_dir: ${base_results_dir}/${.model_train_name}/peft_${.task_name} + +trainer: + devices: 1 + accelerator: gpu + num_nodes: 1 + precision: 16 + logger: False # logger provided by exp_manager + enable_checkpointing: False + use_distributed_sampler: False + max_epochs: 9999 + max_steps: 20000 # consumed_samples = global_step * micro_batch_size * data_parallel_size * accumulate_grad_batches + log_every_n_steps: 10 # frequency with which training steps are logged + val_check_interval: 200 # If is an int n > 1, will run val every n training steps, if a float 0.0 - 1.0 will run val every epoch fraction, e.g. 0.25 will run val every quarter epoch + gradient_clip_val: 1.0 + +exp_manager: + explicit_log_dir: null + exp_dir: null + name: ${peft.name} + create_wandb_logger: False + wandb_logger_kwargs: + project: null + name: null + resume_if_exists: True + resume_ignore_no_checkpoint: True + create_checkpoint_callback: True + checkpoint_callback_params: + monitor: validation_${peft.model.data.validation_ds.metric.name} + save_top_k: 1 + mode: min + save_nemo_on_train_end: True + filename: '${peft.name}--{${peft.exp_manager.checkpoint_callback_params.monitor}:.3f}-{step}-{consumed_samples}' + model_parallel_size: ${peft.model.tensor_model_parallel_size} + always_save_nemo: False + save_best_model: True + create_early_stopping_callback: True + early_stopping_callback_params: + monitor: "val_loss" + mode: "min" + min_delta: 0.001 + patience: 10 + verbose: True + strict: False # Should be False to avoid a runtime error where EarlyStopping says monitor is unavailable, which sometimes happens with resumed training. + +model: + seed: 1234 + tensor_model_parallel_size: 1 # intra-layer model parallelism + pipeline_model_parallel_size: 1 # inter-layer model parallelism + + global_batch_size: 128 + micro_batch_size: 4 + restore_from_path: ??? # Path to an existing .nemo model you wish to add new tasks to or run inference with + resume_from_checkpoint: null # The path to a checkpoint file to continue the training, restores the whole state including the epoch, step, LR schedulers, apex, etc. 
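Throughout these configs, `${.key}` is an OmegaConf relative interpolation that resolves against sibling keys of the same node, while names without a leading dot (e.g. `${base_results_dir}`) resolve from the config root. A small illustrative sketch with placeholder values:

```python
# Illustrative OmegaConf sketch of the relative interpolations used in the run: block.
from omegaconf import OmegaConf

cfg = OmegaConf.create({
    "base_results_dir": "/results",  # placeholder value
    "run": {
        "task_name": "squad",
        "model_train_name": "t5",
        "name": "${.task_name}_${.model_train_name}",
        "results_dir": "${base_results_dir}/${.model_train_name}/peft_${.task_name}",
    },
})
print(cfg.run.name)         # squad_t5
print(cfg.run.results_dir)  # /results/t5/peft_squad
```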
+ save_nemo_on_validation_end: False # Saves an inference ready .nemo file every time a checkpoint is saved during training. + sync_batch_comm: False + megatron_amp_O2: False + + ## Sequence Parallelism + # Makes tensor parallelism more memory efficient for LLMs (20B+) by parallelizing layer norms and dropout sequentially + # See Reducing Activation Recomputation in Large Transformer Models: https://arxiv.org/abs/2205.05198 for more details. + sequence_parallel: False + + ## Activation Checkpoint + activations_checkpoint_granularity: null # 'selective' or 'full' + activations_checkpoint_method: null # 'uniform', 'block', not used with 'selective' + # 'uniform' divides the total number of transformer layers and checkpoints the input activation + # of each chunk at the specified granularity + # 'block' checkpoints the specified number of layers per pipeline stage at the specified granularity + activations_checkpoint_num_layers: null # not used with 'selective' + activations_checkpoint_layers_per_pipeline: null + answer_only_loss: True + gradient_as_bucket_view: False + + hidden_dropout: 0.0 + attention_dropout: 0.0 + ffn_dropout: 0.0 + + peft: + peft_scheme: "adapter" # can be either adapter,ia3, or ptuning + restore_from_path: null + + # Used for adapter peft training + adapter_tuning: + type: 'parallel_adapter' # this should be either 'parallel_adapter' or 'linear_adapter' + adapter_dim: 32 + adapter_dropout: 0.0 + norm_position: 'pre' # This can be set to 'pre', 'post' or null, 'pre' is normally what is used. + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + norm_type: 'mixedfusedlayernorm' # IGNORED if layer_adapter is used, options are ['layernorm', 'mixedfusedlayernorm'] + layer_selection: null # selects in which layers to add adapters, e.g. [1,12] will add adapters to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + lora_tuning: + adapter_dim: 32 + adapter_dropout: 0.0 + column_init_method: 'xavier' # IGNORED if linear_adapter is used, options: xavier, zero or normal + row_init_method: 'zero' # IGNORED if linear_adapter is used, options: xavier, zero or normal + layer_selection: null # selects in which layers to add lora adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + weight_tying: False + position_embedding_strategy: null # used only when weight_tying is True + + # Used for p-tuning peft training + p_tuning: + virtual_tokens: 10 # The number of virtual tokens the prompt encoder should add at the start of the sequence + bottleneck_dim: 1024 # the size of the prompt encoder mlp bottleneck + embedding_dim: 1024 # the size of the prompt encoder embeddings + init_std: 0.023 + + ia3_tuning: + layer_selection: null # selects in which layers to add ia3 adapters. e.g. [1,12] will add lora to layer 1 (lowest) and 12. null will apply adapters to all layers + + data: + train_ds: + # Example of how to specify paths to multiple datasets + # file_names: + # - /path/to/squad.jsonl + # - /path/to/mnli.jsonl + # - /path/to/boolq.jsonl + # Example of how each dataset is formatted + # {'input': 'John von Neumann\nVon Neumann made fundamental contributions .... 
Q: What did the math of artificial viscosity do?', 'output': 'smoothed the shock transition without sacrificing basic physics'} + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: True + num_workers: 0 + memmap_workers: 2 + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: True + # Example of how to specify concat_sampling_probabilities + # concat_sampling_probabilities: + # - 0.5 + # - 0.25 + # - 0.25 + concat_sampling_probabilities: null # When providing a list of datasets, this arg defines the sampling probabilities from each dataset when strategy='random' + context_key: 'input' + label_key: 'output' + add_eos: True + add_sep: False + add_bos: False + separate_prompt_and_response_with_newline: False + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: "{input} {output}" # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + + validation_ds: + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. + global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} # fstring to use for assistant prompt. Example: "Q: {input}\nA: {output}" + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + test_ds: + file_names: ${peft.model.data.validation_ds.file_names} # Path to a list of JSONL files corresponding to the source data. Data format is identical to train_ds. + names: null # Names of the corresponding datasets used to log metrics. 
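The `prompt_template` is applied as a format string over the `context_key`/`label_key` fields of each JSONL record, as in this illustrative sketch (the record text is abbreviated from the example in the comment above):

```python
# Illustrative rendering of one record with prompt_template = "{input} {output}".
record = {
    "input": "Von Neumann made fundamental contributions ... "
             "Q: What did the math of artificial viscosity do?",
    "output": "smoothed the shock transition without sacrificing basic physics",
}
prompt_template = "{input} {output}"

# Context and question are followed by the target answer in a single sequence.
print(prompt_template.format(**record))
```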
+ global_batch_size: ${peft.model.global_batch_size} + micro_batch_size: ${peft.model.micro_batch_size} + shuffle: False + num_workers: 0 + memmap_workers: ${peft.model.data.train_ds.memmap_workers} + pin_memory: True + max_seq_length: 2048 + min_seq_length: 1 + drop_last: False + context_key: 'input' + label_key: 'output' + add_eos: ${peft.model.data.train_ds.add_eos} + add_sep: ${peft.model.data.train_ds.add_sep} + add_bos: ${peft.model.data.train_ds.add_bos} + separate_prompt_and_response_with_newline: ${peft.model.data.train_ds.separate_prompt_and_response_with_newline} + write_predictions_to_file: False + output_file_path_prefix: null # Prefix of the file to write predictions to. + truncation_field: "context" # Options: ['context', 'answer'] + index_mapping_dir: null # Path to a directory to write index mapping files. + prompt_template: ${peft.model.data.train_ds.prompt_template} + tokens_to_generate: 32 # decide how many tokens we want to generate to evaluate performance with string metrics + metric: + name: "loss" # Name of the evaluation metric to use. Options: ['exact_string_match', 'loss'] + average: null # Average the metric over the dataset. Options: ['macro', 'micro']. Works only for 'F1', 'accuracy' etc. Refer to torchmetrics for metrics where this is supported. + num_classes: null + + optim: + name: fused_adam + lr: 1e-4 + weight_decay: 0.01 + betas: + - 0.9 + - 0.98 + sched: + name: CosineAnnealing + warmup_steps: 50 + min_lr: 0.0 # min_lr must be 0.0 for prompt learning when pipeline parallel > 1 + constant_steps: 0 # Constant steps should also be 0 when min_lr=0 + monitor: val_loss + reduce_on_plateau: false From 5057396aaeea1196897469b961266c29e642b4e9 Mon Sep 17 00:00:00 2001 From: "Zhenghang (Max) Xu" Date: Wed, 13 Sep 2023 14:24:13 -0700 Subject: [PATCH 60/62] Update stages.py with PEFT t5 support --- launcher_scripts/nemo_launcher/core/stages.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/launcher_scripts/nemo_launcher/core/stages.py b/launcher_scripts/nemo_launcher/core/stages.py index 4932e08717..3f317e5866 100755 --- a/launcher_scripts/nemo_launcher/core/stages.py +++ b/launcher_scripts/nemo_launcher/core/stages.py @@ -821,13 +821,13 @@ def _get_nemo_code_path(self, model_type: str) -> Path: :return: path current stage's essential nemo scripts code :rtype: Path """ - if model_type == "t5": - raise NotImplementedError("PEFT is not supported in NeMo Megatron t5 models.") + if model_type == "mt5": raise NotImplementedError("PEFT is not supported in NeMo Megatron mt5 models.") model_type_to_code_path = { "gpt3": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py", "llama": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_gpt_peft_tuning.py", + "t5": self._nemo_code_path / "examples/nlp/language_modeling/tuning/megatron_t5_peft_tuning.py", } return model_type_to_code_path[model_type] From 5d4ec27e4f3fe480e859db1a3010c15b598a77c9 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 18 Sep 2023 06:22:47 -0700 Subject: [PATCH 61/62] update support matrix Signed-off-by: dimapihtar --- README.md | 4 ++-- auto_configurator/conf/config.yaml | 2 +- auto_configurator/tests/config_tests/test_main_config.py | 2 +- launcher_scripts/conf/config.yaml | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index c9ae532213..fdea811332 100755 --- a/README.md +++ b/README.md @@ -1977,7 +1977,7 @@ launcher_scripts_path: ${auto_configurator_path}/../launcher_scripts 
fastertransformer_path: ${auto_configurator_path}/../FasterTransformer base_results_dir: ${auto_configurator_path}/results data_dir: ${launcher_scripts_path}/data -training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3 +training_container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01 container_mounts: - null wandb: # Weights and Biases (W&B) logging. @@ -5401,7 +5401,7 @@ VALID_DATA_PATH=/path/to/val_actor TEST_DATA_PATH=/path/to/test_actor NEMO_RLHF_DIR=/opt/nemo-rlhf -CONTAINER="nvcr.io/ea-bignlp/nemofw-training:23.07-py3" +CONTAINER="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01" mkdir -p $OUTPUT_DIR diff --git a/auto_configurator/conf/config.yaml b/auto_configurator/conf/config.yaml index ab712b4a83..d28e5060b6 100644 --- a/auto_configurator/conf/config.yaml +++ b/auto_configurator/conf/config.yaml @@ -18,7 +18,7 @@ launcher_scripts_path: ${auto_configurator_path}/../launcher_scripts base_results_dir: ${auto_configurator_path}/results data_dir: ${launcher_scripts_path}/data -training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3 +training_container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01 container_mounts: - null diff --git a/auto_configurator/tests/config_tests/test_main_config.py b/auto_configurator/tests/config_tests/test_main_config.py index 492ff0395f..b989c57cfd 100644 --- a/auto_configurator/tests/config_tests/test_main_config.py +++ b/auto_configurator/tests/config_tests/test_main_config.py @@ -26,7 +26,7 @@ def test_config(self): base_results_dir: ${auto_configurator_path}/results data_dir: ${launcher_scripts_path}/data - training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3 + training_container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01 container_mounts: - null diff --git a/launcher_scripts/conf/config.yaml b/launcher_scripts/conf/config.yaml index 80267d5a56..0e39bbe0af 100755 --- a/launcher_scripts/conf/config.yaml +++ b/launcher_scripts/conf/config.yaml @@ -39,7 +39,7 @@ data_dir: ${launcher_scripts_path}/data # Location to store and read the data. base_results_dir: ${launcher_scripts_path}/results # Location to store the results, checkpoints and logs. container_mounts: # List of additional paths to mount to container. They will be mounted to same path. - null -container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3 +container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01 wandb_api_key_file: null # File where the w&B api key is stored. Key must be on the first line. From df55ff7d7cd9135a7502296172ab287313e36678 Mon Sep 17 00:00:00 2001 From: dimapihtar Date: Mon, 18 Sep 2023 06:53:43 -0700 Subject: [PATCH 62/62] update support matrix Signed-off-by: dimapihtar --- README.md | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index fdea811332..455c0be7d6 100755 --- a/README.md +++ b/README.md @@ -357,26 +357,25 @@ Figure 1: The GPT family architecture. The 5B variant includes 24 transformer la ### 3.1. 
Support Matrix -| Software | Version | -|-------------------------|------------------| -| NVIDIA Triton | 2.24.0 | -| FasterTransformer | v5.3+f8e42aa | -| TransformerEngine | v0.11+b172bad | -| MegatronCore | 4f8e9ac | -| PyTorch | 2.1.0a0+fe05266 | -| NeMo | 1.20.0+2baef81 | -| PyTorch Lightning | 1.9.4 | -| Hydra | 1.2.0 | -| CUDA | NVIDIA CUDA 12.1 | -| cuBLAS | 12.1.3.1 | -| cuDNN | 8.9.0.131 | -| NCCL | 2.17.1 | -| Container OS | Ubuntu 20.04 | -| rdma-core | 36.0 | -| GDRcopy | 2.3 | -| HPC-X | 2.13 | -| Base Command Manager | 1.0.0 | -| DeepOps | 21.06 | +| Software | Version | +|-------------------------|----------------------| +| NVIDIA Triton | 2.37.0.9383150 | +| TransformerEngine | 0.13.0.dev0+a03f8bc | +| MegatronCore | 0.3.0+ab0336a | +| PyTorch | 2.1.0a0+29c30b1 | +| NeMo | 1.21.0+b850d14 | +| PyTorch Lightning | 2.0.7 | +| Hydra | 1.2.0 | +| CUDA | NVIDIA CUDA 12.2 | +| cuBLAS | 12.2.5.1 | +| cuDNN | 8.9.4.25 | +| NCCL | 2.18.3 | +| Container OS | Ubuntu 22.04 | +| rdma-core | 39.0 | +| GDRcopy | 2.3 | +| HPC-X | 2.15 | +| Base Command Manager | 1.0.0 | +| DeepOps | 21.06 | ## 4. Cloud Service Providers