diff --git a/.gitignore b/.gitignore
index 3e46cef4c5..edbde3c246 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,33 @@
+# User Added
+*tmp*
+*core.*
+*old*
+*.bak
+**index-cache**
+**pbslogs**
+ezpz
+*.o17*
+*.e17*
+*hostfile*
+.deepspeed_env
+*.DS_Store
+old/*
+**venv**
+*.json
+*.o1
+*.e1
+outputs/
+venvs/
+wandb/
+llama-logs/
+checkpoints/
+*.gz
+*.txt
+*.idx
+*.bin
+*.log
+__pycache__
+
 .deepspeed_env
 *.bak
 .cache/*
diff --git a/ALCF/test_sirius.sh b/ALCF/test_sirius.sh
index 6199c5c157..108649612c 100755
--- a/ALCF/test_sirius.sh
+++ b/ALCF/test_sirius.sh
@@ -37,13 +37,11 @@ setup_megatron_deepspeed() {
     echo "Running test in: ${OUTDIR}"
     echo "WORKING DIRECTORY: $(realpath $(pwd .))"
     if [[ -d "Megatron-DeepSpeed" ]]; then
-        # rm -rfv Megatron-DeepSpeed/
-        echo "Found existing Megatron-DeepSpeed.
-        Remove existing directory to run test."
+        echo "Found existing Megatron-DeepSpeed in ${OUTDIR}"
+        echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test."
         exit
     fi
     git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed
-    git checkout remove-apex-deps
 }

@@ -53,12 +51,10 @@ main() {
     export DEBUG=1
     export PBS_O_WORKDIR="$(pwd)"
     export DATA_FILE_LIST=./ALCF/data-lists/sirius/books.txt
-    # LR=0.0008
-    # GRAD_ACC_STEPS=8
     export ZERO_STAGE=1
     export NUM_LAYERS=10
     export MICRO_BATCH=8
-    export TRAIN_ITERS=20
+    export TRAIN_ITER=20
     export TIMING_LOG_LEVEL=1
     bash train_llama_alcf.sh |& tee "test-sirius-${NOW}".log
 }
diff --git a/ALCF/test_sunspot.sh b/ALCF/test_sunspot.sh
index a8a4a21f32..67f6868d43 100755
--- a/ALCF/test_sunspot.sh
+++ b/ALCF/test_sunspot.sh
@@ -3,10 +3,19 @@
 # Run complete test of
 # https://github.com/argonne-lcf/Megatron-DeepSpeed
 # on Sunspot @ ALCF
+# To launch (inside an interactive `qsub -I` job) on Sunspot:
+#
+# ```bash
+# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed
+# $ cd Megatron-DeepSpeed/ALCF
+# $ bash test_sunspot.sh
+# ```
 #
 # EXIT ON ERROR(s)
 set -euxo pipefail

+NOW="$(date "+%Y-%m-%d-%H%M%S")"
+
 ########################################################
 # Setup / activate conda environment,
 # mine is called q4-drop
@@ -26,23 +35,30 @@
 ########################################
 # Make sure ./tmp/Megatron-DeepSpeed
 # does not already exist
 ########################################
 setup_megatron_deepspeed() {
-    mkdir tmp && cd tmp
+    OUTDIR="OUTPUTS/test-sunspot-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}"
+    echo "Running test in: ${OUTDIR}"
+    echo "WORKING DIRECTORY: $(realpath $(pwd .))"
     if [[ -d "Megatron-DeepSpeed" ]]; then
-        # rm -rfv Megatron-DeepSpeed/
-        echo "Found existing Megatron-DeepSpeed.
-        Remove existing directory to run test."
+        echo "Found existing Megatron-DeepSpeed in ${OUTDIR}"
+        echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test."
         exit
     fi
     git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed
-    git checkout remove-apex-deps
 }

 main() {
     setup_conda
     setup_megatron_deepspeed
-    # NOTE: to use OPT=adamwschedulefree, you will need to pip install schedulefree
-    DEBUG=1 PBS_O_WORKDIR="$(pwd)" DATA_FILE_LIST=./ALCF/data-lists/sunspot/books.txt LR=0.0008 GRAD_ACC_STEPS=8 ZERO_STAGE=1 NUM_LAYERS=10 MICRO_BATCH=8 OPT=adamwschedulefree TIMING_LOG_LEVEL=1 bash train_llama_alcf.sh
+    export DEBUG=1
+    export PBS_O_WORKDIR="$(pwd)"
+    export DATA_FILE_LIST=./ALCF/data-lists/sunspot/books.txt
+    export ZERO_STAGE=1
+    export NUM_LAYERS=10
+    export MICRO_BATCH=8
+    export TRAIN_ITER=20
+    export TIMING_LOG_LEVEL=1
+    bash train_llama_alcf.sh |& tee "test-sunspot-${NOW}.log"
 }

 main
diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py
index 6530340c19..7ed38614a7 100644
--- a/pretrain_gpt_alcf.py
+++ b/pretrain_gpt_alcf.py
@@ -8,7 +8,7 @@
 import math
 from functools import partial
 from megatron import get_args
-from megatron import print_rank_0
+# from megatron import print_rank_0
 from megatron import get_timers
 from megatron import get_tokenizer
 from megatron.core import mpu, tensor_parallel
@@ -19,12 +19,12 @@
 from megatron.utils import get_ltor_masks_and_position_ids
 from megatron.utils import average_losses_across_data_parallel_group, update_rotary_pos_emb
 from megatron.arguments import core_transformer_config_from_args
-from megatron.utils import (
-    report_memory,
-    throughput_calculator,
-    checkpoint_throughput_calculator
-)
-from pathlib import Path
+# from megatron.utils import (
+#     # report_memory,
+#     # throughput_calculator,
+#     # checkpoint_throughput_calculator
+# )
+# from pathlib import Path
 from enrich import get_logger

 import deepspeed
@@ -33,7 +33,7 @@
 import subprocess
 import wandb

-import time
+# import time
 from torch import nn
 import torch.nn.functional as F
 import ezpz as ez
@@ -74,7 +74,7 @@ def model_provider(pre_process=True, post_process=True):
     """Build the model."""
-    print_rank_0('building GPT model ...')
+    log.info('building GPT model ...')
     see_memory_usage("Before Building Model", force=True)
     args = get_args()
     config = core_transformer_config_from_args(args)
@@ -118,7 +118,7 @@ def model_provider(pre_process=True, post_process=True):
         # We need to call model.set_batch_fn after deepspeed.initialize
         model._megatron_batch_fn = get_batch_pipe

-        # Predompute the attention mask and store it in args.
+        # Precompute the attention mask and store it in args.
         # This avoids having to pipeline it
         # as an activation during training.
         # The mask is constant, and thus we can reuse it.
@@ -154,12 +154,9 @@ def model_provider(pre_process=True, post_process=True):
         )
     num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

-    # print_rank_0('\n ------------------------ ')
-    # print_rank_0(f'num of parameters {num_params}')
-    # print_rank_0('------------------------\n ')
-    print_rank_0(80 * '-')
-    print_rank_0(f"Number of parameters in model: {num_params}")
-    print_rank_0(80 * '-')
+    log.info(80 * '-')
+    log.info(f"Number of parameters in model: {num_params}")
+    log.info(80 * '-')
     see_memory_usage("After Building Model", force=True)
     if wandb.run is not None:
         tbdir = args.tensorboard_dir
@@ -342,7 +339,7 @@ def loss_func(loss_mask, moe_loss, mos_loss, output_tensor):
             'moe loss': moe_loss,
             'kd loss': mos_loss
         }
-        print_rank_0(
+        log.info(
             f'>>> total loss: {loss}, '
             f'lm loss {averaged_loss[0]}, '
             f'kd loss {mos_loss}'
@@ -419,7 +416,8 @@ def forward_step(data_iterator, model):
     # Get the batch.
     timers('batch-generator', log_level=2).start()
     tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
-        data_iterator)
+        data_iterator
+    )
     timers('batch-generator').stop()

     if args.data_efficiency_curriculum_learning:
@@ -492,11 +490,12 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
     """Build train, valid, and test datasets."""
     args = get_args()

-    print_rank_0('> building train, validation, and test datasets '
-                 'for GPT ...')
+    log.info(
+        '> building train, validation, and test datasets for GPT ...'
+    )
     files = []
     if args.data_file_list is not None:
-        print_rank_0(f"Reading datasets from {args.data_file_list}")
+        log.info(f"Reading datasets from {args.data_file_list}")
         with open(args.data_file_list, 'r') as flist:
             for f in flist.readlines():
                 w, fname = f.split()
@@ -523,8 +522,9 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
            train_data_prefix=args.train_data_path,
            valid_data_prefix=args.valid_data_path,
            test_data_prefix=args.test_data_path,
-           data_cache_path=args.data_cache_path)
-    print_rank_0("> finished creating GPT datasets ...")
+           data_cache_path=args.data_cache_path,
+        )
+    log.info("> finished creating GPT datasets ...")

     return train_ds, valid_ds, test_ds

@@ -566,8 +566,6 @@ def git_ds_info():


 def main():
-    # if RANK == 0:
-    #     setup_wandb()
     if os.getenv('TORCH_PROFILER_ENABLED') == '1':
         from torch.profiler import profile, record_function, ProfilerActivity
         with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
@@ -593,17 +591,26 @@ def main():
         # args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
         data_post_process=data_post_process
     )
-    try:
-        from megatron.text_generation import generate_and_post_process
-        with torch.autocast(device_type=ez.get_torch_device(), dtype=torch.float16):
-            response, _, _, _ = generate_and_post_process(model, prompts=["Hello world", "Nature is", "Turing test comprises", "Explain solar eclipse"], tokens_to_generate=32)
-            if RANK == 0:
-                log.info(f'generation completed..\n response:{response}')
-    except ValueError as ve:
-        log.critical(f'ValueError: {ve}')
-        pass
+    # try:
+    #     from megatron.text_generation import generate_and_post_process
+    #     with torch.autocast(device_type=DEVICE, dtype=args.dtype):
+    #         response, _, _, _ = generate_and_post_process(
+    #             model,
+    #             prompts=[
+    #                 "Hello world",
+    #                 "Nature is",
+    #                 "Turing test comprises",
+    #                 "Explain solar eclipse"
+    #             ],
+    #             tokens_to_generate=32
+    #         )
+    #         if RANK == 0:
+    #             log.info(f'generation completed..\n response:{response}')
+    # except ValueError as ve:
+    #     log.critical(f'ValueError: {ve}')
+    #     pass

     # dist.barrier()
-    model.train()
+    # model.train()

     return model
@@ -623,4 +630,4 @@
         print(f"wandb.run.name: {wandb.run.name}")
         print(f"wandb.run.url: {wandb.run.url}")
         wandb.finish()
-    sys.exit()
+    sys.exit(0)
diff --git a/train_llama_alcf.sh b/train_llama_alcf.sh
index 4aac1153c7..cd2d8213dd 100644
--- a/train_llama_alcf.sh
+++ b/train_llama_alcf.sh
@@ -132,7 +132,7 @@ run_cmd="
 # ds_exec

 # echo "! Using $(which deepspeed)"

-ds_report
+# ds_report

 echo "${run_cmd}"