Merge pull request #10 from argonne-lcf/alcf-tests
Merge `alcf-tests` into `main`
saforem2 authored Apr 25, 2024
2 parents 3145945 + a59a532 commit 7681642
Showing 5 changed files with 100 additions and 51 deletions.
30 changes: 30 additions & 0 deletions .gitignore
@@ -1,3 +1,33 @@
# User Added
*tmp*
*core.*
*old*
*.bak
**index-cache**
**pbslogs**
ezpz
*.o17*
*.e17*
*hostfile*
.deepspeed_env
*.DS_Store
old/*
**venv**
*.json
*.o1
*.e1
outputs/
venvs/
wandb/
llama-logs/
checkpoints/
*.gz
*.txt
*.idx
*.bin
*.log
__pycache__

.deepspeed_env
*.bak
.cache/*
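
Note: several of the new patterns are broad (`*.txt`, `*.json`, `*.bin`), so they can also match files the repo wants tracked, such as the `ALCF/data-lists/*/books.txt` lists used by the test scripts below. A quick sanity check with `git check-ignore` (the first two paths are only examples):

```bash
# -v prints the .gitignore rule responsible for ignoring each path
git check-ignore -v outputs/run.log wandb/latest-run ALCF/data-lists/sirius/books.txt
# force-add a tracked file that a broad pattern would otherwise catch
git add -f ALCF/data-lists/sirius/books.txt
```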
10 changes: 3 additions & 7 deletions ALCF/test_sirius.sh
@@ -37,13 +37,11 @@ setup_megatron_deepspeed() {
echo "Running test in: ${OUTDIR}"
echo "WORKING DIRECTORY: $(realpath $(pwd .))"
if [[ -d "Megatron-DeepSpeed" ]]; then
# rm -rfv Megatron-DeepSpeed/
echo "Found existing Megatron-DeepSpeed.
Remove existing directory to run test."
echo "Found existing Megatron-DeepSpeed in ${OUTDIR}"
echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test."
exit
fi
git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed
git checkout remove-apex-deps
}


@@ -53,12 +51,10 @@ main() {
export DEBUG=1
export PBS_O_WORKDIR="$(pwd)"
export DATA_FILE_LIST=./ALCF/data-lists/sirius/books.txt
# LR=0.0008
# GRAD_ACC_STEPS=8
export ZERO_STAGE=1
export NUM_LAYERS=10
export MICRO_BATCH=8
export TRAIN_ITERS=20
export TRAIN_ITER=20
export TIMING_LOG_LEVEL=1
bash train_llama_alcf.sh |& tee "test-sirius-${NOW}".log
}
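
Note: mirroring the usage comment added to `test_sunspot.sh` below, a sketch of how this test is meant to be launched, from inside an interactive `qsub -I` job on Sirius:

```bash
git clone https://github.com/argonne-lcf/Megatron-DeepSpeed
cd Megatron-DeepSpeed/ALCF
bash test_sirius.sh
```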
30 changes: 23 additions & 7 deletions ALCF/test_sunspot.sh
@@ -3,10 +3,19 @@
# Run complete test of
# https://github.com/argonne-lcf/Megatron-DeepSpeed
# on Sunspot @ ALCF
# to launch (inside an interactive `qsub -I` job) on Sunspot:
#
# ```bash
# $ git clone https://github.com/argonne-lcf/Megatron-DeepSpeed
# $ cd Megatron-DeepSpeed/ALCF
# $ bash test_sunspot.sh
# ```

# EXIT ON ERROR(s)
set -euxo pipefail

NOW="$(date "+%Y-%m-%d-%H%M%S")"

########################################################
# Setup / activate conda environment,
# mine is called q4-drop
@@ -26,23 +35,30 @@ setup_conda() {
# does not already exist
########################################
setup_megatron_deepspeed() {
mkdir tmp && cd tmp
OUTDIR="OUTPUTS/test-sunspot-${NOW}" && mkdir -p "${OUTDIR}" && cd "${OUTDIR}"
echo "Running test in: ${OUTDIR}"
echo "WORKING DIRECTORY: $(realpath $(pwd .))"
if [[ -d "Megatron-DeepSpeed" ]]; then
# rm -rfv Megatron-DeepSpeed/
echo "Found existing Megatron-DeepSpeed.
Remove existing directory to run test."
echo "Found existing Megatron-DeepSpeed in ${OUTDIR}"
echo "Remove Megatron-DeepSpeed from ${OUTDIR} to run test."
exit
fi
git clone https://github.com/argonne-lcf/Megatron-DeepSpeed && cd Megatron-DeepSpeed
git checkout remove-apex-deps
}


main() {
setup_conda
setup_megatron_deepspeed
# NOTE: to use OPT=adamwschedulefree, you will need to pip install schedulefree
DEBUG=1 PBS_O_WORKDIR="$(pwd)" DATA_FILE_LIST=./ALCF/data-lists/sunspot/books.txt LR=0.0008 GRAD_ACC_STEPS=8 ZERO_STAGE=1 NUM_LAYERS=10 MICRO_BATCH=8 OPT=adamwschedulefree TIMING_LOG_LEVEL=1 bash train_llama_alcf.sh
export DEBUG=1
export PBS_O_WORKDIR="$(pwd)"
export DATA_FILE_LIST=./ALCF/data-lists/sunspot/books.txt
export ZERO_STAGE=1
export NUM_LAYERS=10
export MICRO_BATCH=8
export TRAIN_ITER=20
export TIMING_LOG_LEVEL=1
bash train_llama_alcf.sh |& tee "test-sunspot-${NOW}.log"
}

main
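
Note: per the `OPT=adamwschedulefree` comment in `main()`, the schedule-free optimizer is opt-in and requires the `schedulefree` package. A sketch of that variant, assuming `train_llama_alcf.sh` picks `OPT` up from the environment as the replaced one-liner did:

```bash
pip install schedulefree
# select the schedule-free AdamW optimizer, then run the same test
export OPT=adamwschedulefree
bash test_sunspot.sh
```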
79 changes: 43 additions & 36 deletions pretrain_gpt_alcf.py
@@ -8,7 +8,7 @@
import math
from functools import partial
from megatron import get_args
from megatron import print_rank_0
# from megatron import print_rank_0
from megatron import get_timers
from megatron import get_tokenizer
from megatron.core import mpu, tensor_parallel
@@ -19,12 +19,12 @@
from megatron.utils import get_ltor_masks_and_position_ids
from megatron.utils import average_losses_across_data_parallel_group, update_rotary_pos_emb
from megatron.arguments import core_transformer_config_from_args
from megatron.utils import (
report_memory,
throughput_calculator,
checkpoint_throughput_calculator
)
from pathlib import Path
# from megatron.utils import (
# # report_memory,
# # throughput_calculator,
# # checkpoint_throughput_calculator
# )
# from pathlib import Path
from enrich import get_logger

import deepspeed
@@ -33,7 +33,7 @@
import subprocess
import wandb

import time
# import time
from torch import nn
import torch.nn.functional as F
import ezpz as ez
@@ -74,7 +74,7 @@

def model_provider(pre_process=True, post_process=True):
"""Build the model."""
print_rank_0('building GPT model ...')
log.info('building GPT model ...')
see_memory_usage("Before Building Model", force=True)
args = get_args()
config = core_transformer_config_from_args(args)
@@ -118,7 +118,7 @@ def model_provider(pre_process=True, post_process=True):
# We need to call model.set_batch_fn after deepspeed.initialize
model._megatron_batch_fn = get_batch_pipe

# Predompute the attention mask and store it in args.
# Precompute the attention mask and store it in args.
# This avoids having to pipeline it
# as an activation during training.
# The mask is constant, and thus we can reuse it.
@@ -154,12 +154,9 @@ def model_provider(pre_process=True, post_process=True):
)

num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print_rank_0('\n ------------------------ ')
# print_rank_0(f'num of parameters {num_params}')
# print_rank_0('------------------------\n ')
print_rank_0(80 * '-')
print_rank_0(f"Number of parameters in model: {num_params}")
print_rank_0(80 * '-')
log.info(80 * '-')
log.info(f"Number of parameters in model: {num_params}")
log.info(80 * '-')
see_memory_usage("After Building Model", force=True)
if wandb.run is not None:
tbdir = args.tensorboard_dir
@@ -342,7 +339,7 @@ def loss_func(loss_mask, moe_loss, mos_loss, output_tensor):
'moe loss': moe_loss,
'kd loss': mos_loss
}
print_rank_0(
log.info(
f'>>> total loss: {loss}, '
f'lm loss {averaged_loss[0]}, '
f'kd loss {mos_loss}'
@@ -419,7 +416,8 @@ def forward_step(data_iterator, model):
# Get the batch.
timers('batch-generator', log_level=2).start()
tokens, labels, loss_mask, attention_mask, position_ids = get_batch(
data_iterator)
data_iterator
)
timers('batch-generator').stop()

if args.data_efficiency_curriculum_learning:
@@ -492,11 +490,12 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
"""Build train, valid, and test datasets."""
args = get_args()

print_rank_0('> building train, validation, and test datasets '
'for GPT ...')
log.info(
'> building train, validation, and test datasets for GPT ...'
)
files = []
if args.data_file_list is not None:
print_rank_0(f"Reading datasets from {args.data_file_list}")
log.info(f"Reading datasets from {args.data_file_list}")
with open(args.data_file_list, 'r') as flist:
for f in flist.readlines():
w, fname = f.split()
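
Note: each line of the data file list is split into a weight and a dataset path (`w, fname = f.split()`), so a list might look like the following; the weights and paths here are purely illustrative:

```bash
cat > data-list.txt <<'EOF'
0.7 /path/to/books_text_document
0.3 /path/to/wiki_text_document
EOF
DATA_FILE_LIST=./data-list.txt bash train_llama_alcf.sh
```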
@@ -523,8 +522,9 @@ def train_valid_test_datasets_provider(train_val_test_num_samples):
train_data_prefix=args.train_data_path,
valid_data_prefix=args.valid_data_path,
test_data_prefix=args.test_data_path,
data_cache_path=args.data_cache_path)
print_rank_0("> finished creating GPT datasets ...")
data_cache_path=args.data_cache_path,
)
log.info("> finished creating GPT datasets ...")

return train_ds, valid_ds, test_ds

@@ -566,8 +566,6 @@ def git_ds_info():


def main():
# if RANK == 0:
# setup_wandb()
if os.getenv('TORCH_PROFILER_ENABLED') == '1':
from torch.profiler import profile, record_function, ProfilerActivity
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA]) as prof:
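
Note: since the profiler context is gated on `TORCH_PROFILER_ENABLED`, a run can be profiled with no code changes; a minimal sketch, assuming the usual launch script:

```bash
TORCH_PROFILER_ENABLED=1 bash train_llama_alcf.sh |& tee profiled-run.log
```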
@@ -593,17 +591,26 @@ def main():
# args_defaults={'tokenizer_type': 'GPT2BPETokenizer'},
data_post_process=data_post_process
)
try:
from megatron.text_generation import generate_and_post_process
with torch.autocast(device_type=ez.get_torch_device(), dtype=torch.float16):
response, _, _, _ = generate_and_post_process(model, prompts=["Hello world", "Nature is", "Turing test comprises", "Explain solar eclipse"], tokens_to_generate=32)
if RANK == 0:
log.info(f'generation completed..\n response:{response}')
except ValueError as ve:
log.critical(f'ValueError: {ve}')
pass
# try:
# from megatron.text_generation import generate_and_post_process
# with torch.autocast(device_type=DEVICE, dtype=args.dtype):
# response, _, _, _ = generate_and_post_process(
# model,
# prompts=[
# "Hello world",
# "Nature is",
# "Turing test comprises",
# "Explain solar eclipse"
# ],
# tokens_to_generate=32
# )
# if RANK == 0:
# log.info(f'generation completed..\n response:{response}')
# except ValueError as ve:
# log.critical(f'ValueError: {ve}')
# pass
# dist.barrier()
model.train()
# model.train()
return model


@@ -623,4 +630,4 @@ def main():
print(f"wandb.run.name: {wandb.run.name}")
print(f"wandb.run.url: {wandb.run.url}")
wandb.finish()
sys.exit()
sys.exit(0)
2 changes: 1 addition & 1 deletion train_llama_alcf.sh
@@ -132,7 +132,7 @@ run_cmd="

# ds_exec
# echo "! Using $(which deepspeed)"
ds_report
# ds_report

echo "${run_cmd}"

