Merge pull request #131 from NVIDIA/23.08_ci
Merge 23.08 changes
dimapihtar authored Sep 18, 2023
2 parents 52d3c08 + b3e2cc4 commit d24f8ef
Showing 128 changed files with 5,707 additions and 462 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@
#*.ipynb
output
result
data
*.pt
tests/data/asr
.DS_Store
7 changes: 7 additions & 0 deletions Dockerfile
@@ -68,6 +68,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libsndfile1 \
sox \
swig \
openssh-server \
libb64-dev && \
rm -rf /var/lib/apt/lists/*

@@ -179,6 +180,12 @@ RUN pip install --no-cache-dir wandb==0.15.3 \
# Copy FasterTransformer
COPY --from=ft_builder /workspace/FasterTransformer FasterTransformer

# Setup SSH config to allow mpi-operator to communicate with containers in k8s
RUN echo " UserKnownHostsFile /dev/null" >> /etc/ssh/ssh_config && \
sed -i 's/#\(StrictModes \).*/\1no/g' /etc/ssh/sshd_config && \
sed -i 's/# StrictHostKeyChecking ask/ StrictHostKeyChecking no/' /etc/ssh/ssh_config && \
mkdir -p /var/run/sshd

# Examples
WORKDIR /workspace
#COPY any user-facing example scripts should go in here
185 changes: 163 additions & 22 deletions README.md
@@ -145,8 +145,13 @@ The most recent version of the README can be found at [https://ngc.nvidia.com/co
* [5.12 LoRA Model and Generalized PEFT Framework](#512-lora-model-and-generalized-peft-framework)
+ [5.12.1 PEFT Training and Inference for GPT-style Models](#5121-peft-training-and-inference-for-gpt-style-models)
- [5.12.1.1 PEFT Training and Inference](#51211-peft-training-and-inference)
+ [5.12.1.2 PEFT Training with NeMo Megatron Launcher](#51212-peft-training-with-nemo-megatron-launcher)
- [5.12.1.2.1 Common](#512121-common)
- [5.12.1.2.2 Slurm](#512122-slurm)
- [5.12.1.2.3 Base Command Platform](#512123-base-command-platform)
- [5.12.2 PEFT Training and Inference for mT5/T5-style Models](#5122-peft-training-and-inference-for-mt5-t5-style-models)
- [5.12.2.1 PEFT Training and Inference](#51221-peft-training-and-inference)
* [5.13. Model Evaluation](#513-model-evaluation)
+ [5.13.1. GPT Evaluation](#5131-gpt-evaluation)
- [5.13.1.1. Common](#51311-common)
@@ -352,26 +357,25 @@ Figure 1: The GPT family architecture. The 5B variant includes 24 transformer la
### 3.1. Support Matrix
<a id="markdown-support-matrix" name="support-matrix"></a>

| Software | Version |
|-------------------------|------------------|
| NVIDIA Triton | 2.24.0 |
| FasterTransformer | v5.3+f8e42aa |
| TransformerEngine | v0.11+b172bad |
| MegatronCore | 4f8e9ac |
| PyTorch | 2.1.0a0+fe05266 |
| NeMo | 1.20.0+2baef81 |
| PyTorch Lightning | 1.9.4 |
| Hydra | 1.2.0 |
| CUDA | NVIDIA CUDA 12.1 |
| cuBLAS | 12.1.3.1 |
| cuDNN | 8.9.0.131 |
| NCCL | 2.17.1 |
| Container OS | Ubuntu 20.04 |
| rdma-core | 36.0 |
| GDRcopy | 2.3 |
| HPC-X | 2.13 |
| Base Command Manager | 1.0.0 |
| DeepOps | 21.06 |
| Software | Version |
|-------------------------|----------------------|
| NVIDIA Triton | 2.37.0.9383150 |
| TransformerEngine | 0.13.0.dev0+a03f8bc |
| MegatronCore | 0.3.0+ab0336a |
| PyTorch | 2.1.0a0+29c30b1 |
| NeMo | 1.21.0+b850d14 |
| PyTorch Lightning | 2.0.7 |
| Hydra | 1.2.0 |
| CUDA | NVIDIA CUDA 12.2 |
| cuBLAS | 12.2.5.1 |
| cuDNN | 8.9.4.25 |
| NCCL | 2.18.3 |
| Container OS | Ubuntu 22.04 |
| rdma-core | 39.0 |
| GDRcopy | 2.3 |
| HPC-X | 2.15 |
| Base Command Manager | 1.0.0 |
| DeepOps | 21.06 |

## 4. Cloud Service Providers
<a id="markdown-cloud-service-providers" name="cloud-service-providers"></a>
@@ -1972,7 +1976,7 @@ launcher_scripts_path: ${auto_configurator_path}/../launcher_scripts
fastertransformer_path: ${auto_configurator_path}/../FasterTransformer
base_results_dir: ${auto_configurator_path}/results
data_dir: ${launcher_scripts_path}/data
training_container: nvcr.io/ea-bignlp/nemofw-training:23.07-py3
training_container: nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01
container_mounts:
- null
wandb: # Weights and Biases (W&B) logging.
@@ -3777,6 +3781,141 @@ inference.outfile_path=<OUTPUT_FILE>
```
Additionally, NeMo has a notebook which walks through the steps (which these scripts encapsulate) to train and run inference for PEFT models: https://github.com/NVIDIA/NeMo/blob/main/tutorials/nlp/lora.ipynb

##### 5.12.1.2 PEFT Training with NeMo Megatron Launcher
The PEFT stage can launch any of the supported PEFT methods (P-tuning, LoRA, adapters, and IA3) from a single stage by setting a different PEFT scheme.
It is implemented with the `adapter_mixins` framework in a unified style, so mix-and-match schemes such as `adapter_and_ptuning` can easily be extended to `ia3_and_ptuning` or `lora_and_ptuning`.
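
For example, switching from P-tuning to LoRA only requires changing the `peft_scheme` override when launching the stage. A minimal sketch, assuming the `gpt3/squad` PEFT config used in the examples below (all other overrides omitted):

```bash
# Minimal sketch: launch the PEFT stage with LoRA instead of P-tuning.
# Data paths, parallelism, and other overrides are omitted here; see the
# full commands later in this section.
python3 main.py \
    peft=gpt3/squad \
    stages=[peft] \
    peft.model.peft.peft_scheme="lora"
```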

P-tuning no longer needs the flexibility to insert prompt tokens anywhere in the input; this feature has been removed for simplicity.

##### 5.12.1.2.1. Common
<a id="markdown-common" name="common"></a>
To specify the configuration for P-tuning (or LoRA, adapter, or IA3 learning),
use the `run` parameters to define the job-specific configuration:
```yaml
run:
  name: ${.task_name}_${.model_train_name}
  time_limit: "04:00:00"
  dependency: "singleton"
  convert_name: convert_nemo
  model_train_name: gpt3_1.3B
  task_name: "squad"
  results_dir: ${base_results_dir}/${.model_train_name}/ptuning_${.task_name}
```

To specify which language model checkpoint to load and its definition, use the `model` parameter:

```yaml
model:
  language_model_path: ${base_results_dir}/${peft.run.model_train_name}/${peft.run.convert_name}/nemo_gpt1.3B_fp16.nemo
  tensor_model_parallel_size: 2
  pipeline_model_parallel_size: 1
```

##### 5.12.1.2.2 Slurm
<a id="markdown-slurm" name="slurm"></a>

Set configuration for a Slurm cluster in the `conf/cluster/bcm.yaml` file:

```yaml
partition: null
account: null
exclusive: True
gpus_per_task: null
gpus_per_node: 8
mem: 0
overcommit: False
job_name_prefix: "nemo-megatron-"
```

**Example:**

To run only the PEFT stage and not the data preparation, training,
conversion or inference stages, set the `stages` section of the `conf/config.yaml` file to:

```yaml
stages:
- peft
```

then run:
```
python3 main.py \
peft=gpt3/squad \
stages=["peft"] \
peft.model.peft.peft_scheme="ptuning" \
peft.model.megatron_amp_O2=False \
peft.model.restore_from_path=${LANGUAGE_MODEL_PATH} \
peft.exp_manager.exp_dir=${BASE_RESULTS_DIR}/${RUN_NAME}/ptuning
```
##### 5.12.1.2.3 Base Command Platform
<a id="markdown-base-command-platform" name="base-command-platform"></a>
To run the P-tuning learning script on Base Command Platform, set the
`cluster_type` parameter in `conf/config.yaml` to `bcp` or `interactive`. This can also be overridden
from the command line using Hydra.
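
For example, a minimal sketch of overriding `cluster_type` from the command line (all other overrides omitted; the full command is shown below):

```bash
# Sketch only: override cluster_type on the command line instead of editing
# conf/config.yaml; both `bcp` and `interactive` are accepted values.
python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/main.py \
    cluster_type=bcp \
    stages=[peft] \
    peft=gpt3/squad
```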

To run the P-tuning pipeline on the converted nemo-megatron-gpt-1.3B checkpoint, run:
```bash
export HYDRA_FULL_ERROR=1
export TORCH_CPP_LOG_LEVEL=INFO NCCL_DEBUG=INFO
TRAIN="[/mount/workspace/databricks-dolly-15k-train.jsonl]"
VALID="[/mount/workspace/databricks-dolly-15k-val.jsonl]"
VALID_NAMES="[peft-squad]"
CONCAT_SAMPLING_PROBS="[1]"
PEFT_SCHEME="ptuning"
PEFT_EXP_DIR="/results/nemo_launcher/ptuning"
LOG_DIR="/results/nemo_launcher/ptuning_log"
TP_SIZE=2
PP_SIZE=1
python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/main.py \
peft=gpt3/squad \
stages=[peft] \
cluster_type=interactive \
launcher_scripts_path=/opt/NeMo-Megatron-Launcher/launcher_scripts \
peft.model.peft.peft_scheme=${PEFT_SCHEME} \
peft.trainer.precision=bf16 \
peft.trainer.max_steps=100 \
peft.trainer.devices=2 \
peft.trainer.val_check_interval=10 \
peft.model.megatron_amp_O2=False \
peft.model.restore_from_path=/mount/workspace/nemo_gpt1.3B_fp16.nemo \
peft.model.tensor_model_parallel_size=${TP_SIZE} \
peft.model.pipeline_model_parallel_size=${PP_SIZE} \
peft.model.optim.lr=5e-6 \
peft.model.answer_only_loss=True \
peft.model.data.train_ds.file_names=${TRAIN} \
peft.model.data.train_ds.micro_batch_size=1 \
peft.model.data.train_ds.global_batch_size=32 \
peft.model.data.train_ds.concat_sampling_probabilities=${CONCAT_SAMPLING_PROBS} \
peft.model.data.validation_ds.micro_batch_size=1 \
peft.model.data.validation_ds.global_batch_size=32 \
peft.model.data.validation_ds.file_names=${VALID} \
peft.model.data.validation_ds.names=${VALID_NAMES} \
peft.model.data.test_ds.micro_batch_size=1 \
peft.model.data.test_ds.global_batch_size=128 \
peft.model.data.train_ds.num_workers=0 \
peft.model.data.validation_ds.num_workers=0 \
peft.model.data.test_ds.num_workers=0 \
peft.model.data.validation_ds.metric.name=loss \
peft.model.data.test_ds.metric.name=loss \
peft.exp_manager.exp_dir=${PEFT_EXP_DIR} \
peft.exp_manager.explicit_log_dir=${LOG_DIR} \
peft.exp_manager.resume_if_exists=True \
peft.exp_manager.resume_ignore_no_checkpoint=True \
peft.exp_manager.create_checkpoint_callback=True \
peft.exp_manager.checkpoint_callback_params.monitor=validation_loss
```

The command above assumes that the data workspace is mounted at `/mount/workspace/` (the example uses the databricks-dolly-15k dataset) and the results workspace at `/results`. Set a different `peft.exp_manager.exp_dir` for each PEFT job, as in the sketch below.
The stdout and stderr output is also redirected to `/results/nemo_launcher/ptuning_log`, so that the logs can be downloaded from NGC.
Any other parameter can be added to the command to modify its behavior.
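
For example, a sketch of launching two PEFT schemes with separate experiment and log directories (the directory names are illustrative, and the remaining overrides from the full command above still apply):

```bash
# Illustrative sketch: give each PEFT scheme its own exp_dir and log dir so
# results do not collide. The remaining overrides from the full command above
# (data paths, parallelism, etc.) are omitted for brevity.
for SCHEME in ptuning lora; do
  python3 /opt/NeMo-Megatron-Launcher/launcher_scripts/main.py \
    peft=gpt3/squad \
    stages=[peft] \
    cluster_type=interactive \
    peft.model.peft.peft_scheme=${SCHEME} \
    peft.exp_manager.exp_dir=/results/nemo_launcher/${SCHEME} \
    peft.exp_manager.explicit_log_dir=/results/nemo_launcher/${SCHEME}_log
done
```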

##### 5.12.2 PEFT Training and Inference for mT5/T5-style Models
We offer training and inference scripts in NeMo for parameter-efficient tuning of mT5/T5-style models. You can train a LoRA, P-tuning, adapter, or IA3 model using its corresponding training and inference script.

@@ -5261,7 +5400,7 @@ VALID_DATA_PATH=/path/to/val_actor
TEST_DATA_PATH=/path/to/test_actor

NEMO_RLHF_DIR=/opt/nemo-rlhf
CONTAINER="nvcr.io/ea-bignlp/nemofw-training:23.07-py3"
CONTAINER="nvcr.io/ea-bignlp/ga-participants/nemofw-training:23.08.01"

mkdir -p $OUTPUT_DIR

@@ -5458,6 +5597,8 @@ Currently, within the NeMo Data Curator, we support the following data-curation
- Fuzzy deduplication. Our implementation of fuzzy deduplication builds on the following existing libraries:
  - For computing MinHash signatures, we use a modified version of the MinHasher class provided in [pyLSH](https://github.com/mattilyra/LSH)
  - For the locality-sensitive hashing, we extended the Redis-based implementation found in [datasketch](https://github.com/ekzhu/datasketch) beyond a single Redis server to a Redis Cluster. This enables the module to efficiently deduplicate large datasets that do not fit in the memory of a single node (e.g., several TB of text)
- Multilingual downstream-task decontamination
  - Our implementation follows the approach of [OpenAI GPT3](https://arxiv.org/pdf/2005.14165.pdf) and [Microsoft Turing NLG 530B](https://arxiv.org/abs/2201.11990)

The modules are implemented in a scalable manner using [Message Passing Interface (MPI) for Python (mpi4py)](https://mpi4py.readthedocs.io/en/stable/), and we use [Dask](https://dask.org) for creating balanced input jsonl files. With the scalable modules within the NeMo Data Curator, we have been able to fully process a [Common Crawl Snapshot](https://commoncrawl.org/2020/12/nov-dec-2020-crawl-archive-now-available/) (consisting of 60 TB of compressed WARC files) in approximately two days using 30 CPU nodes (with hardware similar to the `c5.24xlarge` [Amazon AWS C5 instance](https://aws.amazon.com/ec2/instance-types/c5/)). Please note that the core functions used within the NeMo Data Curator (e.g., html extraction, text cleaning, heuristic filtering, etc.) have not been fully optimized. The main goal of the NeMo Data Curator is to provide users the capability to apply these functions to their large datasets using many compute nodes.

12 changes: 6 additions & 6 deletions auto_configurator/autoconfig/scripts/compare_throughput.py
@@ -16,7 +16,7 @@ def main(cfg):
settings_cfg = cfg.search_config.train_settings
model_size = settings_cfg.model_size_in_b
output_top_n = settings_cfg.output_top_n
nodes = cfg.get("nodes")
nodes = settings_cfg.num_nodes

training_logs = os.path.join(settings_cfg.get("logs"), "training_logs")
candidate_configs = os.path.join(settings_cfg.get("logs"), "candidate_configs")
@@ -77,11 +77,11 @@ def main(cfg):
model_name = candidate_cfg.get("run").get("name").split("_")[0]
gbs = model_cfg.get("global_batch_size")
enc_seq_len = (
model_cfg.get("encoder_seq_length") if model_name in ("gpt3", "bert") else model_cfg.get("seq_length")
model_cfg.get("encoder_seq_length") if model_name in ("gpt3", "bert", "llama") else model_cfg.get("seq_length")
)
dec_seq_len = data_cfg.get("seq_length_dec")

if model_name in ("gpt3", "bert"):
if model_name in ("gpt3", "bert", "llama"):
hs = model_cfg.get("hidden_size")
ffn_hs = None
layers = model_cfg.get("num_layers")
@@ -184,14 +184,14 @@ def main(cfg):
finally:
continue

result.sort(key=lambda x: x[14])
result.sort(key=lambda x: x[15])
print(f"Top {min(output_top_n, len(result))} configs sorted from fastest to slowest:")
for i, res in enumerate(result):
print(f"Config #{i+1}: {res[-1]} with {res[14]:.4f}s per global step.")
if i + 1 == output_top_n:
break

top_config = f"{model_name}_{model_size}b_{nodes}nodes_tp_{result[0][2]}_pp_{result[0][3]}_mbs_{result[0][4]}_act_ckpt_{result[0][5]}_num_mbs_act_{result[0][6]}_act_per_pipe_{result[0][7]}"
top_config = f"{model_name}_{model_size}b_{nodes}nodes_tp_{result[0][3]}_pp_{result[0][4]}_mbs_{result[0][5]}_act_ckpt_{result[0][6]}_num_mbs_act_{result[0][7]}_act_per_pipe_{result[0][8]}"
print("\n==================================================")
print(f"Optimal config: {top_config} with {result[0][14]:.4f}s per global step.")
print(f"Saving config to {final_result_logs}/optimal_config_{model_size}b_{nodes}nodes.yaml.")
@@ -223,7 +223,7 @@ def calculate_tflops(
Bert Formula:
Model FLOPs = 72BLsh^2 * ( 1 + (s/6h) + (v/12hL))
"""
if model_name == "gpt3":
if model_name in ["gpt3", "llama"]:
# Model FLOPS calculation
model_flops = (
(24 * gbs * enc_seq_len * hs * hs + 4 * gbs * enc_seq_len * enc_seq_len * hs) * (3 * layers)
2 changes: 1 addition & 1 deletion auto_configurator/autoconfig/search_config.py
@@ -20,7 +20,7 @@
from autoconfig.inference_sweep import search_inference_config
from autoconfig.training_config import search_training_config

SUPPORTED_MODELS = ["gpt3", "t5", "mt5", "bert"]
SUPPORTED_MODELS = ["gpt3", "t5", "mt5", "bert", "llama"]


def search_config(cfg: omegaconf.dictconfig.DictConfig, hydra_args: Optional[str] = None):
22 changes: 14 additions & 8 deletions auto_configurator/autoconfig/training_config.py
@@ -69,12 +69,12 @@ def generate_grid_search_configs(
act_layers = train_cfg.get("act_ckpt_layers")

# 2 * num_layers is needed because of encoder/decoder architecture.
multiplier = 1 if model_name in ["gpt3", "bert"] else 2
multiplier = 1 if model_name in ["gpt3", "bert", "llama"] else 2

seq_length = base_cfg["model"]["data"]["seq_length"]
num_layers = (
base_cfg["model"]["num_layers"]
if model_name in ["gpt3", "bert"]
if model_name in ["gpt3", "bert", "llama"]
else base_cfg["model"]["encoder"]["num_layers"]
)

@@ -96,7 +96,7 @@ for mbs in mbs_list:
for mbs in mbs_list:
num_gpus = base_cfg["trainer"]["num_nodes"] * base_cfg["trainer"]["devices"]
gbs = base_cfg["model"]["global_batch_size"]
if model_name in ["gpt3", "bert"]:
if model_name in ["gpt3", "bert", "llama"]:
att_heads = base_cfg["model"]["num_attention_heads"]
num_layers = base_cfg["model"]["num_layers"]
else:
@@ -175,7 +175,7 @@ def _set_activations_checkpoint_params(tp, pp, num_layers, act_method, multiplie
min_layers_per_pipe = 0
max_layers_per_pipe = num_layers
interval_layers_per_pipe = act_multiple
if model_name in ["gpt3", "bert"] and pp > 2: # Interleaved pipeline scheduling.
if model_name in ["gpt3", "bert", "llama"] and pp > 2: # Interleaved pipeline scheduling.
virtual_pipelines = num_layers // pp # TODO: verify that this is the best value.
act_multiple = 1
max_micro_b = pp * (virtual_pipelines - 1) + (pp - 1) * 2 + 1
@@ -190,7 +190,7 @@
else:
act_ckpt_layers = range(0, multiplier * num_layers // pp // virtual_pipelines + 1, act_multiple)

if pp > 1 and model_name in ["gpt3", "bert"]:
if pp > 1 and model_name in ["gpt3", "bert", "llama"]:
# Num micro batches with partial act ckpt
num_micro_batches_partial_act_ckpt = list(range(min_micro_b, max_micro_b + 1, interval_micro_b))
if num_micro_batches_partial_act_ckpt[0] == 0:
@@ -304,6 +304,12 @@ def _tp_pp_mbs_grid_gpt3_80gb(model_size_in_b: float, valid_pp: List[int], seq_l
mbs = [1, 2]
min_model_parallel = 8
max_model_parallel = 32
elif model_size_in_b <= 95:
tp = [4, 8]
pp = [x for x in valid_pp if 1 <= x <= 8]
mbs = [1, 2]
min_model_parallel = 8
max_model_parallel = 64
elif seq_length == 8192:
if model_size_in_b <= 1.0:
tp = [1, 2]
@@ -738,13 +744,13 @@ def _calculate_tp_pp_mbs_grid(
mbs_sizes = train_cfg.get("micro_batch_sizes")
gpu_memory_gb = train_cfg.get("gpu_memory_gb")

multiplier = 1 if model_name in ["gpt3", "bert"] else 2
init_pp = [] if model_name == "gpt3" else [1]
multiplier = 1 if model_name in ["gpt3", "bert", "llama"] else 2
init_pp = [] if model_name in ["gpt3", "llama"] else [1]
valid_pp = init_pp + [
multiplier * x for x in range(1, num_layers + 1) if num_layers % x == 0
] # Only divisors of num_layers are possible.

if model_name == "gpt3":
if model_name in ["gpt3", "llama"]:
if gpu_memory_gb == 80:
tp, pp, mbs, min_model_parallel, max_model_parallel = _tp_pp_mbs_grid_gpt3_80gb(
model_size_in_b=model_size_in_b, valid_pp=valid_pp, seq_length=seq_length
