Skip to content

Commit

Permalink
Merge branch 'master' into lyj/fix_falcon40b
Browse files (browse the repository at this point in the history)
  • Loading branch information
tjruwase authored Jan 3, 2024
2 parents a5c2fe9 + 81cc320 commit 01df63a
Show file tree
Hide file tree
Showing 111 changed files with 1,749 additions and 670 deletions.
56 changes: 0 additions & 56 deletions .github/workflows/amd-mi100.yml

This file was deleted.

59 changes: 0 additions & 59 deletions .github/workflows/auto-sync.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/formatting.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ concurrency:
jobs:

# formatting and basic install on cpu-only machine
formatting:
unit-tests:
runs-on: ubuntu-20.04

steps:
Expand Down
1 change: 1 addition & 0 deletions .github/workflows/nv-a6000.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ name: nv-a6000
on:
pull_request:
paths:
- 'accelerator/cuda_accelerator.py'
- 'deepspeed/inference/v2/**'
- 'tests/unit/inference/v2/**'
- '.github/workflows/nv-a6000.yml'
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/nv-accelerate-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ concurrency:

jobs:
unit-tests:
runs-on: [self-hosted, nvidia, cu111, v100]
runs-on: [self-hosted, nvidia, cu116, v100]

steps:
- uses: actions/checkout@v3
Expand All @@ -28,7 +28,7 @@ jobs:

- name: Install pytorch
run: |
pip install -U --cache-dir $TORCH_CACHE torch torchvision --extra-index-url https://download.pytorch.org/whl/cu111
pip install -U --cache-dir $TORCH_CACHE torch --index-url https://download.pytorch.org/whl/cu118
python -c "import torch; print('torch:', torch.__version__, torch)"
python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ concurrency:
cancel-in-progress: true

jobs:
version-check:
unit-tests:
strategy:
matrix:
pyVersion: ["3.6", "3.7", "3.8", "3.9", "3.10"]
Expand Down
62 changes: 31 additions & 31 deletions CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -7,50 +7,50 @@


# top-level repo folders
/.github/ @jeffra @mrwyattii @loadams
/azure/ @jeffra @awan-10
/benchmarks/ @jeffra @awan-10 @mrwyattii @molly-smith
/bin/ @jeffra
/csrc/ @RezaYazdaniAminabadi @awan-10 @jeffra @cmikeh2 @arashb
/deepspeed/ @jeffra
/docker/ @jeffra @awan-10
/docs/ @jeffra @mrwyattii
/examples/ @jeffra @awan-10 @mrwyattii
/op_builder/ @jeffra @RezaYazdaniAminabadi @cmikeh2
/release/ @jeffra @mrwyattii
/requirements/ @jeffra @mrwyattii
/scripts/ @jeffra @awan-10
/tests/ @jeffra @mrwyattii @tjruwase
/.github/ @mrwyattii @loadams
/azure/ @mrwyattii @awan-10
/benchmarks/ @awan-10 @mrwyattii
/bin/ @mrwyattii
/csrc/ @awan-10 @mrwyattii @cmikeh2 @arashb
/deepspeed/ @mrwyattii
/docker/ @mrwyattii @awan-10
/docs/ @mrwyattii
/examples/ @awan-10 @mrwyattii
/op_builder/ @mrwyattii @cmikeh2
/release/ @loadams @mrwyattii
/requirements/ @loadams @mrwyattii
/scripts/ @mrwyattii @awan-10
/tests/ @mrwyattii @tjruwase @loadams

# deepspeed
/deepspeed/autotuning/ @cli99
/deepspeed/autotuning/ @mrwyattii
/deepspeed/checkpoint/ @tjruwase
/deepspeed/comm/ @awan-10
/deepspeed/compression/ @yaozhewei @minjiaz @xiaoxiawu-microsoft @conglongli
/deepspeed/elasticity/ @jeffra @awan-10
/deepspeed/launcher/ @jeffra @awan-10
/deepspeed/module_inject/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/compression/ @minjiaz @xiaoxiawu-microsoft @conglongli
/deepspeed/elasticity/ @mrwyattii @awan-10
/deepspeed/launcher/ @mrwyattii @awan-10
/deepspeed/module_inject/ @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/moe/ @awan-10
/deepspeed/monitor/ @awan-10 @jeffra
/deepspeed/nebula/ @tjruwase @jeffra
/deepspeed/ops/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/monitor/ @awan-10 @mrwyattii
/deepspeed/nebula/ @tjruwase @mrwyattii
/deepspeed/ops/ @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/pipe/ @ShadenSmith @duli2012
/deepspeed/profiling/ @cli99
/deepspeed/utils/ @jeffra @tjruwase @awan-10
/deepspeed/profiling/ @ShijieZZZZ
/deepspeed/utils/ @mrwyattii @tjruwase @awan-10

# inference
/deepspeed/inference/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/model_implementations/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/inference/ @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/model_implementations/ @mrwyattii @awan-10 @cmikeh2 @arashb

# training
/deepspeed/runtime/ @jeffra @tjruwase
/deepspeed/runtime/activation_checkpointing/ @jeffra @tjruwase
/deepspeed/runtime/checkpoint_engine/ @tjruwase @jeffra
/deepspeed/runtime/ @mrwyattii @tjruwase
/deepspeed/runtime/activation_checkpointing/ @mrwyattii @tjruwase
/deepspeed/runtime/checkpoint_engine/ @tjruwase @mrwyattii
/deepspeed/runtime/comm/ @awan-10
/deepspeed/runtime/compression/ @awan-10 @conglongli
/deepspeed/runtime/data_pipeline/ @conglongli
/deepspeed/runtime/fp16/ @jeffra @tjruwase
/deepspeed/runtime/fp16/ @mrwyattii @tjruwase
/deepspeed/runtime/fp16/onebit/ @conglongli @awan-10
/deepspeed/runtime/pipe/ @ShadenSmith @duli2012
/deepspeed/runtime/swap_tensor/ @tjruwase @mrwyattii
/deepspeed/runtime/zero/ @jeffra @tjruwase @samyam @mrwyattii
/deepspeed/runtime/zero/ @tjruwase @mrwyattii
17 changes: 17 additions & 0 deletions accelerator/abstract_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,19 @@ def communication_backend_name(self):
def is_triton_supported(self):
...

# Graph operations
@abc.abstractmethod
def create_graph(self):
...

@abc.abstractmethod
def capture_to_graph(self, graph, pool=None, stream=None):
...

@abc.abstractmethod
def replay_graph(self, graph):
...

# Tensor operations
@property
@abc.abstractmethod
Expand Down Expand Up @@ -258,3 +271,7 @@ def get_op_builder(self, class_name):
@abc.abstractmethod
def build_extension(self):
...

@abc.abstractmethod
def export_envs(self):
...
15 changes: 14 additions & 1 deletion accelerator/cpu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,18 @@ def is_fp16_supported(self):
def supported_dtypes(self):
return [torch.float, torch.bfloat16]

# Tensor operations
# Graph operations
def create_graph(self):
return None

def capture_to_graph(self, graph, pool=None, stream=None):
from deepspeed.runtime.utils import noop_context
return noop_context()

def replay_graph(self, graph):
return

# Tensor operations
@property
def BFloat16Tensor(self):
return torch.BFloat16Tensor
Expand Down Expand Up @@ -280,3 +290,6 @@ def get_op_builder(self, class_name):
def build_extension(self):
from torch.utils.cpp_extension import BuildExtension
return BuildExtension

def export_envs(self):
return []
14 changes: 14 additions & 0 deletions accelerator/cuda_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,17 @@ def is_triton_supported(self):
else:
return False

# Graph operations
def create_graph(self):
return torch.cuda.CUDAGraph()

def capture_to_graph(self, graph, pool=None, stream=None):
return torch.cuda.graph(graph, pool, stream)

def replay_graph(self, graph):
graph.replay()
return

# Tensor operations

@property
Expand Down Expand Up @@ -322,3 +333,6 @@ def get_op_builder(self, class_name):
def build_extension(self):
from torch.utils.cpp_extension import BuildExtension
return BuildExtension

def export_envs(self):
return ['NCCL']
14 changes: 14 additions & 0 deletions accelerator/mps_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,17 @@ def communication_backend_name(self):
def is_triton_supported(self):
return False

# Graph operations
def create_graph(self):
return None

def capture_to_graph(self, graph, pool=None, stream=None):
from deepspeed.runtime.utils import noop_context
return noop_context()

def replay_graph(self, graph):
return

# Tensor operations
@property
def BFloat16Tensor(self):
Expand Down Expand Up @@ -235,3 +246,6 @@ def build_extension(self):
from torch.utils.cpp_extension import BuildExtension

return BuildExtension

def export_envs(self):
return []
14 changes: 14 additions & 0 deletions accelerator/npu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,17 @@ def communication_backend_name(self):
def is_triton_supported(self):
return False

# Graph operations
def create_graph(self):
return None

def capture_to_graph(self, graph, pool=None, stream=None):
from deepspeed.runtime.utils import noop_context
return noop_context()

def replay_graph(self, graph):
return

# Tensor operations

@property
Expand Down Expand Up @@ -255,3 +266,6 @@ def get_op_builder(self, class_name):
def build_extension(self):
from torch.utils.cpp_extension import BuildExtension
return BuildExtension

def export_envs(self):
return ['ASCEND', 'HCCL', 'LD_LIBRARY', 'PATH']
3 changes: 2 additions & 1 deletion csrc/includes/cpu_adagrad.h
Original file line number Diff line number Diff line change
Expand Up @@ -194,14 +194,15 @@ void Adagrad_Optimizer::Step_AVX(size_t* rounded_size,
#elif defined(__ENABLE_CANN__)
if (dev_params) {
size_t memcpy_size = copy_size * sizeof(_doubled_buffer[_buf_index][0]);
if (half_precision) memoryCopySize /= 2;
if (half_precision) memcpy_size /= 2;
aclrtMemcpy(dev_params + t,
memcpy_size,
_doubled_buffer[_buf_index],
memcpy_size,
aclrtMemcpyKind::ACL_MEMCPY_HOST_TO_DEVICE);

_buf_index = !_buf_index;
}
#endif
}
*rounded_size = new_rounded_size;
Expand Down
Loading

0 comments on commit 01df63a

Please sign in to comment.