Adding the new feature of FPDT (#441) #70

Open
wants to merge 3 commits into base: main
4 changes: 2 additions & 2 deletions examples_deepspeed/finetune_hf_llama/README.md
@@ -10,9 +10,9 @@ The pre-trained weights can be found at [Hugging Face - LLAMA-7B](https://huggin

#### 1. Converting Hugging Face Model Weights to Megatron-Deepspeed Model
```bash
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert_hf2mds
```
This command writes the Hugging Face model weights into the Megatron-Deepspeed model and saves it. You can adjust the parallel configuration in the script.
This command writes the Hugging Face model weights into the Megatron-Deepspeed model and saves it. You can adjust the parallel configuration in the script. `convert_mds2hf` can convert a Megatron-Deepspeed model back into the Hugging Face format.
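The README's code block above shows only the HF-to-Megatron direction; a minimal sketch of the reverse conversion mentioned in the prose, assuming it uses the same script interface as `convert_hf2mds`:

```bash
# Convert a Megatron-DeepSpeed checkpoint back into the Hugging Face format
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert_mds2hf
```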

#### 2. Fine-tuning Process
```bash
8 changes: 7 additions & 1 deletion examples_deepspeed/finetune_hf_llama/ds_config.json
@@ -1,5 +1,11 @@
{
"train_batch_size" : 256,
"train_micro_batch_size_per_gpu": 16,
"steps_per_print": 1
"steps_per_print": 100,
"zero_optimization": {
"stage": 0
},
"bf16": {
"enabled": true
}
}
5 changes: 5 additions & 0 deletions examples_deepspeed/finetune_hf_llama/ds_config_empty.json
@@ -0,0 +1,5 @@
{
"train_batch_size" : 256,
"train_micro_batch_size_per_gpu": 16,
"steps_per_print": 100
}
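This minimal config is the one the updated script selects for the two conversion modes (see the `DS_CONFIG_PATH` logic in the script diff below); it deliberately omits the ZeRO and bf16 settings used for fine-tuning. A hypothetical pre-flight check, not part of the PR, that the file parses as valid JSON:

```bash
# Hypothetical sanity check: confirm the minimal config is valid JSON before running a conversion
python -m json.tool examples_deepspeed/finetune_hf_llama/ds_config_empty.json
```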
10 changes: 9 additions & 1 deletion examples_deepspeed/finetune_hf_llama/finetune_llama.sh
@@ -43,6 +43,13 @@ cat <<EOT > $DS_CONFIG
}
EOT

if [ "$1" = "convert_hf2mds" ]; then
DS_CONFIG_PATH="./examples_deepspeed/finetune_hf_llama/ds_config_empty.json"
elif [ "$1" = "convert_mds2hf" ]; then
DS_CONFIG_PATH="./examples_deepspeed/finetune_hf_llama/ds_config_empty.json"
else
DS_CONFIG_PATH="./examples_deepspeed/finetune_hf_llama/ds_config.json"
fi
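Based on this selection logic, a sketch of how the first positional argument drives the choice of DeepSpeed config (paths as in this diff; the exact fine-tuning invocation may carry additional arguments):

```bash
# Both conversion modes use the minimal ds_config_empty.json
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert_hf2mds   # HF -> Megatron-DeepSpeed
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh convert_mds2hf   # Megatron-DeepSpeed -> HF

# Any other (or no) first argument falls through to the full ds_config.json used for fine-tuning
bash examples_deepspeed/finetune_hf_llama/finetune_llama.sh
```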

covert_hf2mds_args="deepspeed tools/hf2megads_weight_converter.py \
--hf-ckpt-num-shards 2 \
@@ -69,6 +76,7 @@ comm_args="--tensor-model-parallel-size $TP \
--num-layers $NUM_LAYERS \
--hidden-size $HIDDEN_SIZE \
--num-attention-heads $NUM_HEADS \
--finetune \
--ffn-hidden-size $FFN_HIDDEN_SIZE \
--attention-dropout 0 \
--hidden-dropout 0 \
@@ -97,7 +105,7 @@ comm_args="--tensor-model-parallel-size $TP \
--zero-stage 0 \
--tokenizer-type HFTokenizer \
--tokenizer-model $HF_LLAMA_PATH \
--deepspeed_config ./examples_deepspeed/finetune_hf_llama/ds_config.json \
--deepspeed_config $DS_CONFIG_PATH \
--deepspeed \
--distributed-backend nccl \
--num-workers 0 \