Merge pull request #10 from argonne-lcf/alcf-tests
Merge `alcf-tests` into `main`
saforem2 authored Apr 25, 2024
2 parents 3145945 + a59a532 commit 7681642
Showing 5 changed files with 100 additions and 51 deletions.
30 changes: 30 additions & 0 deletions .gitignore
@@ -1,3 +1,33 @@
# User Added
*tmp*
*core.*
*old*
*.bak
**index-cache**
**pbslogs**
ezpz
*.o17*
*.e17*
*hostfile*
.deepspeed_env
*.DS_Store
old/*
**venv**
*.json
*.o1
*.e1
outputs/
venvs/
wandb/
llama-logs/
checkpoints/
*.gz
*.txt
*.idx
*.bin
*.log
__pycache__

.deepspeed_env
*.bak
.cache/*
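
Note: several of the new patterns are broad (`*.txt`, `*.json`, `*.bin`), so they can also match files the repo wants tracked, such as the `ALCF/data-lists/*/books.txt` lists used by the test scripts below. A quick sanity check with `git check-ignore` (the first two paths are only examples):

```bash
# -v prints the .gitignore rule responsible for ignoring each path
git check-ignore -v outputs/run.log wandb/latest-run ALCF/data-lists/sirius/books.txt
# force-add a tracked file that a broad pattern would otherwise catch
git add -f ALCF/data-lists/sirius/books.txt
```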
10 changes: 3 additions & 7 deletions ALCF/test_sirius.sh
@@ -37,13 +37,11 @@ setup_megatron_deepspeed() {
echo "Running test in: ${OUTDIR}"
echo "WORKING DIRECTORY: $(realpath $(pwd .))"
if [[ -d "Megatron-DeepSpeed" ]]; then
# rm -rfv Megatron-DeepSpeed/
echo "Found existing Megatron-DeepSpeed.
Remove existing directory to run test."
echo "Found existing Megatron-DeepSpeed in ${OUTDIR}"
echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test."
exit
fi
git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed
git checkout remove-apex-deps
}


@@ -53,12 +51,10 @@ main() {
export DEBUG=1
export PBS_O_WORKDIR="$(pwd)"
export DATA_FILE_LIST=./ALCF/data-lists/sirius/books.txt
# LR=0.0008
# GRAD_ACC_STEPS=8
export ZERO_STAGE=1
export NUM_LAYERS=10
export MICRO_BATCH=8
export TRAIN_ITERS=20
export TRAIN_ITER=20
export TIMING_LOG_LEVEL=1
bash train_llama_alcf.sh |& tee "test-sirius-${NOW}".log
}
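
Note: mirroring the usage comment added to `test_sunspot.sh` below, a sketch of how this test is meant to be launched, from inside an interactive `qsub -I` job on Sirius:

```bash
git clone https://github.com/argonne-lcf/Megatron-DeepSpeed
cd Megatron-DeepSpeed/ALCF
bash test_sirius.sh
```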
30 changes: 23 additions & 7 deletions ALCF/test_sunspot.sh
@@ -3,10 +3,19 @@
# Run complete test of
# https://github.com/argonne-lcf/Megatron-DeepSpeed
# on Sunspot @ ALCF
# to launch (inside an interactive `qsub -I` job) on Sunspot:
#
# ```bash
# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed
# $ cd Megatron-DeepSpeed/ALCF
# $ bash test_sunspot.sh
# ```

# EXIT ON ERROR(s)
set -euxo pipefail

NOW="$(date "+%Y-%m-%d-%H%M%S")"

########################################################
# Setup / activate conda environment,
# mine is called q4-drop
@@ -26,23 +35,30 @@ setup_conda() {
# does not already exist
########################################
setup_megatron_deepspeed() {
mkdir tmp && cd tmp
OUTDIR="OUTPUTS/test-sunspot-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}"
echo "Running test in: ${OUTDIR}"
echo "WORKING DIRECTORY: $(realpath $(pwd .))"
if [[ -d "Megatron-DeepSpeed" ]]; then
# rm -rfv Megatron-DeepSpeed/
echo "Found existing Megatron-DeepSpeed.
Remove existing directory to run test."
echo "Found existing Megatron-DeepSpeed in ${OUTDIR}"
echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test."
exit
fi
git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed
git checkout remove-apex-deps
}


main() {
setup_conda
setup_megatron_deepspeed
# NOTE: to use OPT=adamwschedulefree, you will need to pip install schedulefree
DEBUG=1 PBS_O_WORKDIR="$(pwd)" DATA_FILE_LIST=./ALCF/data-lists/sunspot/books.txt LR=0.0008 GRAD_ACC_STEPS=8 ZERO_STAGE=1 NUM_LAYERS=10 MICRO_BATCH=8 OPT=adamwschedulefree TIMING_LOG_LEVEL=1 bash train_llama_alcf.sh
export DEBUG=1
export PBS_O_WORKDIR="$(pwd)"
export DATA_FILE_LIST=./ALCF/data-lists/sunspot/books.txt
export ZERO_STAGE=1
export NUM_LAYERS=10
export MICRO_BATCH=8
export TRAIN_ITER=20
export TIMING_LOG_LEVEL=1
bash train_llama_alcf.sh |& tee "test-sunspot-${NOW}.log"
}

main
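
Note: per the `OPT=adamwschedulefree` comment in `main()`, the schedule-free optimizer is opt-in and requires the `schedulefree` package. A sketch of that variant, assuming `train_llama_alcf.sh` picks `OPT` up from the environment as the replaced one-liner did:

```bash
pip install schedulefree
# select the schedule-free AdamW optimizer, then run the same test
export OPT=adamwschedulefree
bash test_sunspot.sh
```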
79 changes: 43 additions & 36 deletions pretrain_gpt_alcf.py
@@ -8,7 +8,7 @@
import math
from functools import partial
from megatron import get_args
from megatron import print_rank_0
# from megatron import print_rank_0
from megatron import get_timers
from megatron import get_tokenizer
from megatron.core import mpu, tensor_parallel
@@ -19,12 +19,12 @@
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import average_losses_across_data_parallel_group, update_rotary_pos_emb
from megatron.arguments import core_transformer_config_from_args
from megatron.utils import (
report_memory,
throughput_calculator,
checkpoint_throughput_calculator
)
from pathlib import Path
# from megatron.utils import (
# # report_memory,
# # throughput_calculator,
# # checkpoint_throughput_calculator
# )
# from pathlib import Path
from enrich import get_logger

import deepspeed
@@ -33,7 +33,7 @@
import subprocess
import wandb

import time
# import time
from torch import nn
import torch.nn.functional as F
import ezpz as ez
@@ -74,7 +74,7 @@

def model_provider(pre_process=True, post_process=True):
"""Build the model."""
print_rank_0('building GPT model ...')
log.info('building GPT model ...')
see_memory_usage("Before Building Model", force=True)
args = get_args()
config = core_transformer_config_from_args(args)
@@ -118,7 +118,7 @@ def model_provider(pre_process=True, post_process=True):
# We need to call model.set_batch_fn after deepspeed.initialize
model._megatron_batch_fn = get_batch_pipe

# Predompute the attention mask and store it in args.
# Precompute the attention mask and store it in args.
# This avoids having to pipeline it
# as an activation during training.
# The mask is constant, and thus we can reuse it.
@@ -154,12 +154,9 @@ def model_provider(pre_process=True, post_process=True):
)

num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print_rank_0('\n ------------------------ ')
# print_rank_0(f'num of parameters {num_params}')
# print_rank_0('------------------------\n ')
print_rank_0(80 * '-')
print_rank_0(f"Number of parameters in model: {num_params}")
print_rank_0(80 * '-')
log.info(80 * '-')
log.info(f"Number of parameters in model: {num_params}")
log.info(80 * '-')
see_memory_usage("After Building Model", force=True)
if wandb.run is not None:
tbdir = args.tensorboard_dir
@@ -342,7 +339,7 @@ def loss_func(loss_mask, moe_loss, mos_loss, output_tensor):
'moe loss': moe_loss,
'kd loss': mos_loss
}
print_rank_0(
log.info(
f'>>> total loss: {loss}, '
f'lm loss {averaged_loss[0]}, '
f'kd loss {mos_loss}'
@@ -419,7 +416,8 @@ def forward_step(data_iterator, model):
# Get the batch.
timers('batch-generator', log_level=2).start()
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
data_iterator)
data_iterator
)
timers('batch-generator').stop()

if args.data_efficiency_curriculum_learning:
@@ -492,11 +490,12 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
"""Build train, valid, and test datasets."""
args = get_args()

print_rank_0('> building train, validation, and test datasets '
'for GPT ...')
log.info(
'> building train, validation, and test datasets for GPT ...'
)
files = []
if args.data_file_list is not None:
print_rank_0(f"Reading datasets from {args.data_file_list}")
log.info(f"Reading datasets from {args.data_file_list}")
with open(args.data_file_list, 'r') as flist:
for f in flist.readlines():
w, fname = f.split()
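
Note: each line of the data file list is split into a weight and a dataset path (`w, fname = f.split()`), so a list might look like the following; the weights and paths here are purely illustrative:

```bash
cat > data-list.txt <<'EOF'
0.7 /path/to/books_text_document
0.3 /path/to/wiki_text_document
EOF
DATA_FILE_LIST=./data-list.txt bash train_llama_alcf.sh
```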
@@ -523,8 +522,9 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
train_data_prefix=args.train_data_path,
valid_data_prefix=args.valid_data_path,
test_data_prefix=args.test_data_path,
data_cache_path=args.data_cache_path)
print_rank_0("> finished creating GPT datasets ...")
data_cache_path=args.data_cache_path,
)
log.info("> finished creating GPT datasets ...")

return train_ds, valid_ds, test_ds

@@ -566,8 +566,6 @@ def git_ds_info():


def main():
# if RANK == 0:
# setup_wandb()
if os.getenv('TORCH_PROFILER_ENABLED') == '1':
from torch.profiler import profile, record_function, ProfilerActivity
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
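
Note: since the profiler context is gated on `TORCH_PROFILER_ENABLED`, a run can be profiled with no code changes; a minimal sketch, assuming the usual launch script:

```bash
TORCH_PROFILER_ENABLED=1 bash train_llama_alcf.sh |& tee profiled-run.log
```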
@@ -593,17 +591,26 @@ def main():
# args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
data_post_process=data_post_process
)
try:
from megatron.text_generation import generate_and_post_process
with torch.autocast(device_type=ez.get_torch_device(), dtype=torch.float16):
response, _, _, _ = generate_and_post_process(model, prompts=["Hello world", "Nature is", "Turing test comprises", "Explain solar eclipse"], tokens_to_generate=32)
if RANK == 0:
log.info(f'generation completed..\n response:{response}')
except ValueError as ve:
log.critical(f'ValueError: {ve}')
pass
# try:
# from megatron.text_generation import generate_and_post_process
# with torch.autocast(device_type=DEVICE, dtype=args.dtype):
# response, _, _, _ = generate_and_post_process(
# model,
# prompts=[
# "Hello world",
# "Nature is",
# "Turing test comprises",
# "Explain solar eclipse"
# ],
# tokens_to_generate=32
# )
# if RANK == 0:
# log.info(f'generation completed..\n response:{response}')
# except ValueError as ve:
# log.critical(f'ValueError: {ve}')
# pass
# dist.barrier()
model.train()
# model.train()
return model


@@ -623,4 +630,4 @@ def main():
print(f"wandb.run.name: {wandb.run.name}")
print(f"wandb.run.url: {wandb.run.url}")
wandb.finish()
sys.exit()
sys.exit(0)
2 changes: 1 addition & 1 deletion train_llama_alcf.sh
@@ -132,7 +132,7 @@ run_cmd="

# ds_exec
# echo "! Using $(which deepspeed)"
ds_report
# ds_report

echo "${run_cmd}"

