merge: microsoft-main into main #62

Open · wants to merge 51 commits into `main`

Commits (51)
634e37b
Add steps and results for running ZeRO stage 3 with Universal Checkpoi…
xylian86 Jun 26, 2024
527957e
Add Zero Bubble Pipeline Parallelism H1 Schedule (#396)
nvmdava Jun 27, 2024
f2d7589
Fix ParallelMLP and enable accelerator test (#403)
xinyu-intel Jun 27, 2024
ea4b67a
Fix test_deallocate_output_tensor (#404)
xinyu-intel Jun 27, 2024
08f5a99
Fixed missing BookCorpus dataset. (#407)
costin-eseanu Jul 1, 2024
c3a13be
Set proper arguments when constructing models in unit tests (#408)
xinyu-intel Jul 1, 2024
330f9f2
use split/squeeze instead of slice for performance (#409)
polisettyvarma Jul 8, 2024
af06d14
improve performance by keeping attention_mask on device and run ops f…
polisettyvarma Jul 8, 2024
ec3f1f4
Improve RoPE perf by using cached sin/cos tensors (#410)
polisettyvarma Jul 11, 2024
354e420
Extend test utilities to support more accelerators (#418)
xinyu-intel Jul 12, 2024
73252c0
clear document (#395)
inkcherry Jul 12, 2024
0971e68
add PyTorch profiler support (#414)
polisettyvarma Jul 15, 2024
73029ed
[Wandb] Refine wandb logging function (#416)
billishyahao Jul 16, 2024
fc989b8
add kill switch file support to gracefully exit training at runtime (…
polisettyvarma Jul 17, 2024
7d23e33
add support to run custom Hf tokenizer for training and dataset pre-p…
polisettyvarma Jul 18, 2024
13f2673
improve repeat_kv GQA perf (#419)
polisettyvarma Jul 19, 2024
3af2e25
acquire device when required (#420)
polisettyvarma Jul 19, 2024
08b9376
Add basic compilation test (#426)
loadams Jul 19, 2024
3afd267
Update yml to be valid (#427)
loadams Jul 19, 2024
8822a5c
Update/add GPT/Llama universal checkpointing scripts (#391)
lekurile Jul 29, 2024
1bfc35c
fixing the bug of flash_attn import and the wrong gather index when u…
YJHMITWEB Aug 1, 2024
53b241f
add fused_rms_norm support on XPU device (#431)
ys950902 Aug 4, 2024
61350c5
pass batch_dim_idx to deepspeed sequence parallel distributed attenti…
YJHMITWEB Aug 7, 2024
f132876
[LLaMa] Adding support converting checkpoint from mds to hf (#432)
billishyahao Aug 10, 2024
cdf5194
add device check when import ipex (#436)
ys950902 Aug 14, 2024
b7b2d5e
fix TFLOPs calculation (#371)
polisettyvarma Aug 19, 2024
4f9f1f6
fix nan issue when running megatron-deepspeed (#434)
ys950902 Aug 24, 2024
8e9d973
enable empty cache on XPU device (#438)
ys950902 Aug 26, 2024
543543a
[wandb] disable wandb more gracefully (#422)
billishyahao Aug 27, 2024
1280f59
[Bug] Fix crash when logging optimizer state to tb (#417)
billishyahao Aug 27, 2024
0d6e379
Enable Sequence Parallelism (#429)
polisettyvarma Sep 4, 2024
8bc5313
merge: Create `microsoft-main`
saforem2 Sep 12, 2024
a1ede68
Remove duplicate `--profile` arg
saforem2 Sep 12, 2024
6b32cff
debug: `sequence_parallel` issue in `RMSNorm` ??
saforem2 Sep 12, 2024
5ac877a
Update `megatron/training_log_alcf.py`
saforem2 Sep 12, 2024
b3e0f6f
Update `megatron/training.py`
saforem2 Sep 13, 2024
2113dbc
Update `megatron/utils.py`
saforem2 Sep 13, 2024
7f71572
Update `megatron/training_log.py`
saforem2 Sep 13, 2024
7cb9c11
Update `pretrain_gpt_alcf.py`
saforem2 Sep 15, 2024
e83de19
Update `megatron/training_log.py`
saforem2 Sep 15, 2024
29756d6
Warn if mismatch b/w iters in `megatron/checkpointing.py`
saforem2 Sep 15, 2024
1a7f03b
fix: `try/except` for non tensors in `megatron/training_log.py`
saforem2 Sep 16, 2024
828f6a9
fix: Correctly draw `grad_acc_steps` batches of data when skipping step
saforem2 Sep 17, 2024
295fcb3
Update `pretrain_gpt_alcf.py`
saforem2 Sep 17, 2024
598c092
grad_wei can't be NoneType when running with DeepSpeed, for zero3 wil…
ys950902 Sep 20, 2024
cf80e6b
added sophia
Sep 23, 2024
09accde
Merge pull request #59 from mngom2/spike-skipper
saforem2 Sep 30, 2024
8be7f48
fix init issue for rms_norm in squence_parallel (#448)
ys950902 Oct 4, 2024
4448492
enable profiler for specific ranks (#451)
ranzhejiang Oct 8, 2024
cef3fc7
Merge pull request #58 from argonne-lcf/spike-skipper
saforem2 Oct 8, 2024
fd94b37
merge: Resolve merge conflicts pulling in from Microsoft upstream
saforem2 Oct 8, 2024
35 changes: 35 additions & 0 deletions .github/workflows/python.yml
@@ -0,0 +1,35 @@
name: python

on:
  workflow_dispatch:
  pull_request:
    branches:
      '**'
  schedule:
    - cron: "0 0 * * *"

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  unit-tests:
    strategy:
      matrix:
        pyVersion: ["3.7", "3.8", "3.9", "3.10"]
      fail-fast: false

    runs-on: ubuntu-22.04
    container:
      image: deepspeed/gh-builder:py${{ matrix.pyVersion }}

    steps:
      - uses: actions/checkout@v4

      - name: environment
        run: |
          which python
          python --version
      - name: Install Megatron-DeepSpeed
        run: |
          pip3 install .
8 changes: 1 addition & 7 deletions examples_deepspeed/finetune_hf_llama/ds_config.json
@@ -1,11 +1,5 @@
{
  "train_batch_size" : 256,
  "train_micro_batch_size_per_gpu": 16,
-  "steps_per_print": 100,
-  "zero_optimization": {
-    "stage": 0
-  },
-  "bf16": {
-    "enabled": true
-  }
+  "steps_per_print": 1
}
23 changes: 17 additions & 6 deletions examples_deepspeed/finetune_hf_llama/finetune_llama.sh
@@ -1,8 +1,8 @@
DS_CONFIG=./examples_deepspeed/finetune_hf_llama/ds_config.json
-DATASET_PATH=./alpaca_data.json
+DATASET_PATH=./examples_deepspeed/finetune_hf_llama/alpaca_data.json
# dataset link: https://github.com/tatsu-lab/stanford_alpaca/blob/main/alpaca_data.json

-HF_LLAMA_PATH=/data/llama-7b/
+HF_LLAMA_PATH=/data/llama-2-7b-hf/
# weights link: https://huggingface.co/huggyllama/llama-7b

MICRO_BATCH_SIZE=16
@@ -44,11 +44,20 @@ cat <<EOT > $DS_CONFIG
EOT


-covert_args="deepspeed tools/hf2megads_weight_converter.py \
+covert_hf2mds_args="deepspeed tools/hf2megads_weight_converter.py \
--hf-ckpt-num-shards 2 \
---origin-hf-ckpt-dir $HF_LLAMA_PATH \
+--hf-ckpt-dir $HF_LLAMA_PATH \
+--load-mode auto \
--save $MEGA_DS_LLAMA_PATH"

covert_mds2hf_args="deepspeed tools/hf2megads_weight_converter.py \
--hf-ckpt-num-shards 2 \
--hf-ckpt-dir $HF_LLAMA_PATH \
--load-mode auto \
--to-hf-ckpt \
--load $MEGA_DS_LLAMA_PATH \
--save $HF_LLAMA_PATH'-hf-out' "

finetune_args="deepspeed finetune_llama.py \
--load $MEGA_DS_LLAMA_PATH"

@@ -98,8 +107,10 @@ comm_args="--tensor-model-parallel-size $TP \
--no-gradient-accumulation-fusion \
--repeated-dataloader"

if [ "$1" = "convert" ]; then
task_args="$covert_args"
if [ "$1" = "convert_hf2mds" ]; then
task_args="$covert_hf2mds_args"
elif [ "$1" = "convert_mds2hf" ]; then
task_args="$covert_mds2hf_args"
else
task_args="$finetune_args"
fi
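
Based on the dispatch on `$1` above, the converter and fine-tuning paths would presumably be invoked as follows; these are illustrative commands inferred from the diff, not quoted from it:

```bash
# HF -> Megatron-DeepSpeed checkpoint conversion
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert_hf2mds

# Megatron-DeepSpeed -> HF checkpoint conversion (the mode added in this diff)
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert_mds2hf

# Any other (or no) argument falls through to fine-tuning
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh
```
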
13 changes: 12 additions & 1 deletion examples_deepspeed/pretrain_llama2_distributed.sh
@@ -41,6 +41,17 @@ GRAD_CLIP=1
# activation_checkpoint="true"
activation_checkpoint="false"

LOG_TO_WANDB=0
WANDB_ARGS=
if [ $LOG_TO_WANDB -eq 1 ]
then
WANDB_ARGS="\
--wandb-project pretrain-llama2 \
--wandb-exp-name exp0 \
--wandb-save-dir ${BASE_PATH}/wandb \
"
fi

# Below configuration required for llama model as per llama paper
# --no-query-key-layer-scaling \
# --attention-dropout 0 \
@@ -53,7 +64,6 @@ activation_checkpoint="false"
######################################



cat <<EOT > $DS_CONFIG
{
"train_batch_size" : $GLOBAL_BATCH_SIZE,
@@ -132,4 +142,5 @@ torchrun $DISTRIBUTED_ARGS \
--normalization rmsnorm \
--disable-bias-linear \
--num-key-value-heads $NUM_KV_HEADS \
$WANDB_ARGS \
$ds_args
@@ -187,14 +187,6 @@ host="${HOSTNAME}"
seed=1234
num_workers=0

-data_path="BookCorpusDataset_text_document"
-if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
-wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.bin
-fi
-if [ ! -f "BookCorpusDataset_text_document.idx" ]; then
-wget https://the-eye.eu/public/AI/pile_neox/data/BookCorpusDataset_text_document.idx
-fi
-
vocab_path="gpt2-vocab.json"
if [ ! -f "$vocab_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
@@ -204,6 +196,24 @@ if [ ! -f "$merge_path" ]; then
wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
fi


data_path="BookCorpusDataset_text_document"
if [ ! -f "BookCorpusDataset_text_document.bin" ]; then
# Download the Bookcorpus dataset and convert to json
python preprocess_bookcorpus.py

# Process the dataset
python ${dir}/../../tools/preprocess_data.py \
--input ${data_path}.json \
--output-prefix "BookCorpusDataset" \
--vocab-file $vocab_path \
--merge-file $merge_path \
--dataset-impl mmap \
--tokenizer-type GPT2BPETokenizer \
--workers 32 \
--append-eod
fi

prescale_grad="true"
jobname="gpt_${model_size}B_tok${train_tokens_in_billion}B"
jobname="${jobname}_lr${lr}_min${min_lr}_w${lr_warmup_tokens_in_million}M_d${lr_decay_tokens_in_billion}B_${lr_decay_style}"
4 changes: 4 additions & 0 deletions examples_deepspeed/sequence_parallel/preprocess_bookcorpus.py
@@ -0,0 +1,4 @@
from datasets import load_dataset

train_data = load_dataset('bookcorpus/bookcorpus', split='train')
train_data.to_json("BookCorpusDataset_text_document.json", lines=True)
47 changes: 35 additions & 12 deletions examples_deepspeed/universal_checkpointing/README.md
@@ -10,12 +10,12 @@ This folder contains example scripts that demonstrate how to use Universal Check
For ZeRO stage 1, we provide bash scripts for bf16 and fp16 training examples corresponding to the steps 1 and 3 above. The step 1 scripts launch a training run of TP=PP=DP=2 of 200 iterations that creates a checkpoint every 100 iterations. The step 3 scripts load a universal checkpoint of iteration 100 and resume training with TP=PP=2 and DP=1 for an additional 100 iterations. Users can modify these scripts to try out other save and resume 3D combinations (e.g., save TP=PP=DP=1 and resume TP=PP=DP=2). Tensorboard logs are created by both step 1 and 3 scripts to enable visual inspection of how well the loss curves of the initial and resumed training runs match, especially at iteration 101.

1. bf16:
-* run_bf16.sh: step 1
-* run_universal_bf16.sh: step 3
+* megatron_gpt/run_bf16.sh: step 1
+* megatron_gpt/run_universal_bf16.sh: step 3

2. fp16:
-* run_fp16.sh: step 1
-* run_universal_fp16.sh: step 3
+* megatron_gpt/run_fp16.sh: step 1
+* megatron_gpt/run_universal_fp16.sh: step 3

Please note that these scripts should be run from the root folder of the repo (i.e., two levels above this README). For illustration, here are the commands for running the bf16 example.

@@ -41,22 +41,22 @@ NOTE: Make sure to update your `BASE_DATA_PATH` path in the `run_[bf16/fp16].sh`

### Step 1: Create ZeRO checkpoint
```bash
-bash examples_deepspeed/universal_checkpointing/run_bf16.sh
+bash examples_deepspeed/universal_checkpointing/megatron_gpt/run_bf16.sh
```
-By default the script will create the checkpoints in folder `z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_toy`
+By default the script will create the checkpoints in folder `z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_sp1_toy`

### Step 2: Convert ZeRO checkpoint of iteration 100 to Universal format
Assuming the DeepSpeed source code is cloned into the home folder, the following command will generate universal checkpoint for iteration 100.

```bash
python ${HOME}/DeepSpeed/deepspeed/checkpoint/ds_to_universal.py \
---input_folder z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_toy/global_step100 \
---output_folder z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_toy/global_step100_universal
+--input_folder z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_sp1_toy/global_step100 \
+--output_folder z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_sp1_toy/global_step100_universal
```
Note that we chose to create the universal checkpoint in the same checkpoint folder as the ZeRO checkpoint. This maintains the normal checkpoint folder structure expected by the Megatron-DeepSpeed code, which makes it easy to load universal checkpoints with little/no script or code changes. For clarity, we show below the contents of the checkpoint folder after creation of the universal checkpoint. Note that the conversion script creates `global_step100_universal` folder and `latest_universal` file.

```bash
-ls -l z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_toy/
+ls -l z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_sp1_toy/
total 48
drwxr-xr-x 2 user group 4096 Oct 21 08:51 global_step100
drwxr-xr-x 3 user group 4096 Oct 21 09:28 global_step100_universal
@@ -69,21 +69,23 @@ drwxr-xr-x 2 user group 4096 Oct 21 09:01 global_step200

### Step 3: Resume training with Universal checkpoint of iteration 100
```bash
-bash examples_deepspeed/universal_checkpointing/run_universal_bf16.sh
+bash examples_deepspeed/universal_checkpointing/megatron_gpt/run_universal_bf16.sh
```
This resumption script loads the universal checkpoint rather than the ZeRO checkpoint in the folder by passing the `--universal-checkpoint` command-line flag to the main training script (i.e., `pretrain_gpt.py`).
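
For orientation only, here is a minimal sketch of what that flag change amounts to; the variable names and the `deepspeed pretrain_gpt.py` launch line are assumptions, not an excerpt of the repo's `run_universal_bf16.sh`:

```bash
#!/bin/bash
# Sketch: the resumed run keeps the Step 1 arguments and adds
# --universal-checkpoint, so the loader picks up global_step100_universal
# (recorded in latest_universal) instead of the ZeRO checkpoint global_step100.
CHECKPOINT_PATH="z1_uni_ckpt/checkpoints/gpt2/z1/bf16/tp2_pp2_dp2_sp1_toy"

options=" \
    --load ${CHECKPOINT_PATH} \
    --universal-checkpoint"

# "$@" stands in for the model/data/parallelism arguments from the Step 1 script.
deepspeed pretrain_gpt.py ${options} "$@"
```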

Please see the corresponding [pull request](https://github.com/microsoft/Megatron-DeepSpeed/pull/276) for visualizations of matching loss values between original and universal checkpoint runs for bf16 and fp16 examples.

Combining sequence parallelism with data parallelism is another good use case for universal checkpointing; see the [sp pull request](https://github.com/microsoft/DeepSpeed/pull/4752) for an example and a visualization of matching loss values.

Note: Model weights saved with the ```--no-pipeline-parallel``` flag and model weights saved without it cannot currently be converted into one another.

### TensorBoard Log Analysis

The Universal Checkpointing example includes a TensorBoard analysis script that generates `csv` files and `png` plots across the universal checkpointing training steps for comparison of the training and validation loss curves.

After Step 3 is completed, the script may be executed as follows:
```bash
-bash examples_deepspeed/universal_checkpointing/run_tb_analysis.sh z1_uni_ckpt
+bash examples_deepspeed/universal_checkpointing/megatron_gpt/run_tb_analysis_gpt.sh z1_uni_ckpt
```

The script will output the following `csv` files:
@@ -116,4 +118,25 @@ Repeat steps in ZeRO stage 1 training above with the following modifications to
* Set ZERO_STAGE=2
* Add `--no-pipeline-parallel` flag to deepspeed options

-## ZeRO stage 3 training (**Coming soon**)
+## ZeRO stage 3 training
Repeat steps in ZeRO stage 1 training above with the following modifications to your job batch scripts:
* Set ZERO_STAGE=3
* Add `--no-pipeline-parallel` flag to deepspeed options

> **Note:** The stage 3 universal checkpoint currently supports data parallelism.
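
To make the bullet points above concrete, here is a hedged sketch of the edit to a Step 1/Step 3 batch script; the `ZERO_STAGE` and `ds_args` variable names and the `--zero-stage` option are assumed to follow the pattern of the example pretraining scripts, not copied from this PR:

```bash
#!/bin/bash
# Illustrative excerpt only: rerun the ZeRO stage 1 recipe with the stage bumped
# to 3 and pipeline parallelism disabled, since the stage 3 universal checkpoint
# path is exercised with pure data parallelism.
ZERO_STAGE=3

ds_args=" \
    --zero-stage=${ZERO_STAGE} \
    --no-pipeline-parallel \
    ${ds_args:-}"
```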

Below is the visualization of the `png` files generated from ZeRO stage 3.

<div align="center">
<img src="assets/image/uc_stage3_char_training_loss.png" alt="" width="600"/>

*Figure 1: Training LM loss curve for the first 200 training steps of Step 1 (TP=1, PP=1, DP=4) and for training steps 101 to 200 of Step 3 (TP=1, PP=1, DP=2), which was resumed from the universal checkpoint.*
</div>

<div align="center">
<img src="assets/image/uc_stage3_char_validation_loss.png" alt="" width="600"/>

*Figure 2: Validation LM loss curve for the first 200 training steps of Step 1 (TP=1, PP=1, DP=4) and for training steps 101 to 200 of Step 3 (TP=1, PP=1, DP=2), which was resumed from the universal checkpoint.*
</div>

