Merge branch 'master' into graph_capture
inkcherry authored Dec 19, 2023
2 parents a7f5c10 + a00bdde commit 338ebd5
Showing 30 changed files with 249 additions and 221 deletions.
56 changes: 0 additions & 56 deletions .github/workflows/amd-mi100.yml

This file was deleted.

59 changes: 0 additions & 59 deletions .github/workflows/auto-sync.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/formatting.yml
@@ -16,7 +16,7 @@ concurrency:
 jobs:
 
   # formatting and basic install on cpu-only machine
-  formatting:
+  unit-tests:
     runs-on: ubuntu-20.04
 
     steps:
4 changes: 2 additions & 2 deletions .github/workflows/nv-accelerate-v100.yml
@@ -18,7 +18,7 @@ concurrency:
 
 jobs:
   unit-tests:
-    runs-on: [self-hosted, nvidia, cu111, v100]
+    runs-on: [self-hosted, nvidia, cu116, v100]
 
     steps:
       - uses: actions/checkout@v3
@@ -28,7 +28,7 @@ jobs:
 
       - name: Install pytorch
         run: |
-          pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
+          pip install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
           python -c "import torch; print('torch:', torch.__version__, torch)"
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
 
2 changes: 1 addition & 1 deletion .github/workflows/python.yml
@@ -17,7 +17,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  version-check:
+  unit-tests:
     strategy:
       matrix:
         pyVersion: ["3.6", "3.7", "3.8", "3.9", "3.10"]
62 changes: 31 additions & 31 deletions CODEOWNERS
@@ -7,50 +7,50 @@
 
 
 # top-level repo folders
-/.github/ @jeffra @mrwyattii @loadams
-/azure/ @jeffra @awan-10
-/benchmarks/ @jeffra @awan-10 @mrwyattii @molly-smith
-/bin/ @jeffra
-/csrc/ @RezaYazdaniAminabadi @awan-10 @jeffra @cmikeh2 @arashb
-/deepspeed/ @jeffra
-/docker/ @jeffra @awan-10
-/docs/ @jeffra @mrwyattii
-/examples/ @jeffra @awan-10 @mrwyattii
-/op_builder/ @jeffra @RezaYazdaniAminabadi @cmikeh2
-/release/ @jeffra @mrwyattii
-/requirements/ @jeffra @mrwyattii
-/scripts/ @jeffra @awan-10
-/tests/ @jeffra @mrwyattii @tjruwase
+/.github/ @mrwyattii @loadams
+/azure/ @mrwyattii @awan-10
+/benchmarks/ @awan-10 @mrwyattii
+/bin/ @mrwyattii
+/csrc/ @awan-10 @mrwyattii @cmikeh2 @arashb
+/deepspeed/ @mrwyattii
+/docker/ @mrwyattii @awan-10
+/docs/ @mrwyattii
+/examples/ @awan-10 @mrwyattii
+/op_builder/ @mrwyattii @cmikeh2
+/release/ @loadams @mrwyattii
+/requirements/ @loadams @mrwyattii
+/scripts/ @mrwyattii @awan-10
+/tests/ @mrwyattii @tjruwase @loadams
 
 # deepspeed
-/deepspeed/autotuning/ @cli99
+/deepspeed/autotuning/ @mrwyattii
 /deepspeed/checkpoint/ @tjruwase
 /deepspeed/comm/ @awan-10
-/deepspeed/compression/ @yaozhewei @minjiaz @xiaoxiawu-microsoft @conglongli
-/deepspeed/elasticity/ @jeffra @awan-10
-/deepspeed/launcher/ @jeffra @awan-10
-/deepspeed/module_inject/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
+/deepspeed/compression/ @minjiaz @xiaoxiawu-microsoft @conglongli
+/deepspeed/elasticity/ @mrwyattii @awan-10
+/deepspeed/launcher/ @mrwyattii @awan-10
+/deepspeed/module_inject/ @mrwyattii @awan-10 @cmikeh2 @arashb
 /deepspeed/moe/ @awan-10
-/deepspeed/monitor/ @awan-10 @jeffra
-/deepspeed/nebula/ @tjruwase @jeffra
-/deepspeed/ops/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
+/deepspeed/monitor/ @awan-10 @mrwyattii
+/deepspeed/nebula/ @tjruwase @mrwyattii
+/deepspeed/ops/ @mrwyattii @awan-10 @cmikeh2 @arashb
 /deepspeed/pipe/ @ShadenSmith @duli2012
-/deepspeed/profiling/ @cli99
-/deepspeed/utils/ @jeffra @tjruwase @awan-10
+/deepspeed/profiling/ @ShijieZZZZ
+/deepspeed/utils/ @mrwyattii @tjruwase @awan-10
 
 # inference
-/deepspeed/inference/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
-/deepspeed/model_implementations/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
+/deepspeed/inference/ @mrwyattii @awan-10 @cmikeh2 @arashb
+/deepspeed/model_implementations/ @mrwyattii @awan-10 @cmikeh2 @arashb
 
 # training
-/deepspeed/runtime/ @jeffra @tjruwase
-/deepspeed/runtime/activation_checkpointing/ @jeffra @tjruwase
-/deepspeed/runtime/checkpoint_engine/ @tjruwase @jeffra
+/deepspeed/runtime/ @mrwyattii @tjruwase
+/deepspeed/runtime/activation_checkpointing/ @mrwyattii @tjruwase
+/deepspeed/runtime/checkpoint_engine/ @tjruwase @mrwyattii
 /deepspeed/runtime/comm/ @awan-10
 /deepspeed/runtime/compression/ @awan-10 @conglongli
 /deepspeed/runtime/data_pipeline/ @conglongli
-/deepspeed/runtime/fp16/ @jeffra @tjruwase
+/deepspeed/runtime/fp16/ @mrwyattii @tjruwase
 /deepspeed/runtime/fp16/onebit/ @conglongli @awan-10
 /deepspeed/runtime/pipe/ @ShadenSmith @duli2012
 /deepspeed/runtime/swap_tensor/ @tjruwase @mrwyattii
-/deepspeed/runtime/zero/ @jeffra @tjruwase @samyam @mrwyattii
+/deepspeed/runtime/zero/ @tjruwase @mrwyattii
36 changes: 24 additions & 12 deletions deepspeed/checkpoint/deepspeed_checkpoint.py
@@ -34,7 +34,10 @@ class DeepSpeedCheckpoint(object):
 
     def __init__(self, dir, tp_degree=None, pp_degree=None, dp_degree=None):
        self.dir = dir
-        self._validate_folder(dir)
+
+        pipeline_parallel = len(get_files_with_prefix(get_files(dir), LAYER_FILE_PREFIX)) > 0
+
+        self._validate_folder(dir, pipeline_parallel)
 
         self.zero_checkpoint = ZeROCheckpoint(dir)
 
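Note: the folder is now probed for layer_* files up front, and the result decides how strictly _validate_folder checks the contents. A minimal standalone sketch of the probe, with simplified stand-ins for the two helpers (the real implementations and the LAYER_FILE_PREFIX constant live in deepspeed.checkpoint; the value below is an assumption):

import os

LAYER_FILE_PREFIX = 'layer_'  # assumed value of the deepspeed.checkpoint constant

def get_files(dir):
    # Simplified stand-in: collect every file under `dir`, recursively.
    return [os.path.join(root, f) for root, _, files in os.walk(dir) for f in files]

def get_files_with_prefix(all_files, prefix):
    # Simplified stand-in: keep files whose basename starts with `prefix`.
    return [f for f in all_files if os.path.basename(f).startswith(prefix)]

def is_pipeline_parallel(dir):
    # Pipeline-parallel checkpoints store per-layer files ('layer_*');
    # checkpoints without pipeline parallelism contain none.
    return len(get_files_with_prefix(get_files(dir), LAYER_FILE_PREFIX)) > 0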
@@ -193,7 +196,10 @@ def get_final_norm_files(self, tp_index: int) -> list:
         return self.tp_to_final_norm_map[tp_index]
 
     def _build_tp_other_layer_map(self, layer_index: int):
-        assert layer_index < len(self.layer_files)
+        data_map = {}
+        if len(self.layer_files) < 1:
+            return data_map
+        assert layer_index <= len(self.layer_files)
         layer_files = get_files_with_prefix(self.layer_files, self.layer_keys[layer_index])
         layer_file_partitions = partition_data(layer_files, self.tp_degree)
         data_map = {i: flist for i, flist in enumerate(layer_file_partitions)}
@@ -207,9 +213,13 @@ def get_2d_parallel_files(self, tp_index: int, pp_index: int) -> list:
 
     def _build_pp_transformer_map(self):
         data_map = {}
-        transformer_layers = self.layer_keys[1:-1]
-        layers_per_pp = len(transformer_layers) // self.pp_degree
-        data_map = {i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp] for i in range(0, self.pp_degree)}
+        if self.pp_degree > 0:
+            transformer_layers = self.layer_keys[1:-1]
+            layers_per_pp = len(transformer_layers) // self.pp_degree
+            data_map = {
+                i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp]
+                for i in range(0, self.pp_degree)
+            }
         return data_map
 
     def _dump_mapping(self, data_map, map_tag=None):
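Note: guarding on self.pp_degree > 0 means checkpoints without pipeline parallelism now yield an empty map instead of dividing by zero. A worked example of the guarded logic (standalone rewrite; the layer keys are hypothetical):

def build_pp_transformer_map(layer_keys, pp_degree):
    # Standalone rewrite of the guarded mapping above.
    data_map = {}
    if pp_degree > 0:
        transformer_layers = layer_keys[1:-1]  # drop embedding and final-norm keys
        layers_per_pp = len(transformer_layers) // pp_degree
        data_map = {
            i: transformer_layers[i * layers_per_pp:(i + 1) * layers_per_pp]
            for i in range(pp_degree)
        }
    return data_map

print(build_pp_transformer_map(['embed', 't00', 't01', 't02', 't03', 'norm'], 2))
# {0: ['t00', 't01'], 1: ['t02', 't03']}
print(build_pp_transformer_map(['embed', 'norm'], 0))
# {} -- previously this path raised ZeroDivisionError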
@@ -222,9 +232,9 @@ def _build_transformer_file_map(self):
         transformer_layer_keys = self.layer_keys[1:-1]
         file_map = {}
         # XXX: this is not guaranteed
-        layers_per_pp = len(transformer_layer_keys) // self.pp_degree
-        if layers_per_pp == 0:
-            layers_per_pp = 1
+        layers_per_pp = 1
+        if self.pp_degree > 0:
+            layers_per_pp = len(transformer_layer_keys) // self.pp_degree
         #print(f"{transformer_layer_keys} {layers_per_pp}")
         for key_index, layer_key in enumerate(transformer_layer_keys):
             pp_index = key_index // layers_per_pp
@@ -240,8 +250,8 @@ def _build_transformer_file_map(self):
 
     def _sanity_check(self):
         assert len(self.mp_rank_files) % self.tp_degree == 0
-        assert len(self.layer_keys) > 2
         assert self.zero_checkpoint.num_files % (self.pp_degree * self.tp_degree) == 0
+        assert self.zero_checkpoint.num_files % (self.tp_degree) == 0
         # XXX: fix me - isn't always the case
         # only true with --pp-partition-method 'type:transformer|embedding' \
         # assert (len(self.layer_keys) - 2) % self.pp_degree == 0
@@ -270,12 +280,14 @@ def _merge_state_dicts(self, sd_list):
 
         return merged_sd
 
-    def _validate_folder(self, dir):
+    def _validate_folder(self, dir, pipeline_parallel):
         basic_folder_validation(dir)
 
         file_list = get_files(dir)
 
-        for file_prefix in [MODEL_FILE_PREFIX, LAYER_FILE_PREFIX, f'{LAYER_FILE_PREFIX}01']:
+        file_prefix_list = [MODEL_FILE_PREFIX]
+        if pipeline_parallel:
+            file_prefix_list.extend([LAYER_FILE_PREFIX, f'{LAYER_FILE_PREFIX}01'])
+        for file_prefix in file_prefix_list:
             ckpt_files = get_files_with_prefix(file_list, file_prefix)
             assert len(
                 ckpt_files
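Note: _validate_folder now only requires layer_* files when the folder was detected as pipeline-parallel; mp_rank_* files are required either way. A sketch of the prefix selection, assuming MODEL_FILE_PREFIX == 'mp_rank_' and LAYER_FILE_PREFIX == 'layer_' (the real values live in deepspeed.checkpoint.constants):

MODEL_FILE_PREFIX = 'mp_rank_'  # assumed constant value
LAYER_FILE_PREFIX = 'layer_'    # assumed constant value

def required_prefixes(pipeline_parallel):
    # Every checkpoint must contain model files; layer files are only
    # checked when the folder was detected as pipeline-parallel.
    prefixes = [MODEL_FILE_PREFIX]
    if pipeline_parallel:
        prefixes.extend([LAYER_FILE_PREFIX, f'{LAYER_FILE_PREFIX}01'])
    return prefixes

print(required_prefixes(False))  # ['mp_rank_']
print(required_prefixes(True))   # ['mp_rank_', 'layer_', 'layer_01']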
8 changes: 4 additions & 4 deletions deepspeed/checkpoint/ds_to_universal.py
@@ -15,7 +15,7 @@
 import shutil
 import torch
 import tqdm
-# from pprint import pprint
+#from pprint import pprint
 
 from deepspeed.checkpoint import DeepSpeedCheckpoint
 from deepspeed.checkpoint import (
@@ -241,9 +241,9 @@ def _extract_zero_shard_files(args, ds_checkpoint, temp_dir):
     _3d_range_list = list(
         itertools.product(range(ds_checkpoint.pp_degree), range(ds_checkpoint.tp_degree),
                           range(ds_checkpoint.dp_degree)))
-    # pprint(f'{_3d_range_list=}')
+    #pprint(f'{_3d_range_list=}')
     work_chunks = list(_get_chunks(_3d_range_list, args.num_extract_workers))
-    # pprint(f'{work_chunks=}')
+    #pprint(f'{work_chunks=}')
 
     # extract_zero_shards(temp_dir, ds_checkpoint, _3d_range_list[0])
     do_work = partial(extract_zero_shards, temp_dir, ds_checkpoint)
@@ -309,7 +309,7 @@ def main():
     print('*** 1. Extracting ZeRO fragments')
     _extract_zero_shard_files(args, ds_checkpoint, temp_dir)
 
-    print('*** 2. Merging slices')
+    print('*** 2. Merging slices .....')
     _merge_tp_slice_files(args, ds_checkpoint, slice_shapes, temp_dir)
 
     print('*** 3. Saving common optimizer states')
2 changes: 1 addition & 1 deletion deepspeed/checkpoint/reshape_3d_utils.py
@@ -81,7 +81,7 @@ def get_model_3d_descriptor(dir):
     else:
         tp_degree = len(get_files_with_prefix(file_list, MODEL_FILE_PREFIX))
         dp_degree = max(1, len(zero_file_list) // tp_degree)
-        pp_degree = 0
+        pp_degree = 1
 
     return model_3d_desc(pp_degree, tp_degree, dp_degree)
 
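Note: returning pp_degree = 1 (rather than 0) for folders without layer files keeps degree products non-zero, so modulus-style checks such as the _sanity_check asserts above cannot divide by zero. A tiny illustration with hypothetical file counts:

pp_degree, tp_degree, num_zero_files = 1, 2, 8  # hypothetical values
assert num_zero_files % (pp_degree * tp_degree) == 0  # fine with pp_degree = 1
# with pp_degree = 0, the same check would raise ZeroDivisionError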
2 changes: 1 addition & 1 deletion deepspeed/checkpoint/universal_checkpoint.py
@@ -13,8 +13,8 @@ def load_hp_checkpoint_state(self, folder, tp_rank, tp_world_size):
         hp_mapping = self._hp_mapping
         optim_state_keys = hp_mapping.get_optim_state_keys()
         hp_keys = [FP32_WEIGHT_KEY] + optim_state_keys
+        #print(f'{hp_keys=}')
         checkpoint_files = {key: os.path.join(folder, f"{key}.pt") for key in hp_keys}
-
         for file in checkpoint_files.values():
             assert os.path.isfile(file), f'{file} is not a valid file'
 
(Diffs for the remaining 20 changed files are not rendered here.)
