Skip to content

Commit

Permalink
Merge branch 'master' into gated_act_fn_fix
Browse files Browse the repository at this point in the history
  • Loading branch information
loadams authored Dec 13, 2023
2 parents 8c9bacb + 3324efd commit 8eb7b73
Show file tree
Hide file tree
Showing 42 changed files with 666 additions and 129 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/nv-a6000.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@ name: nv-a6000
on:
pull_request:
paths:
- "deepspeed/inference/v2/**"
- "tests/unit/inference/v2/**"
- ".github/workflows/nv-a6000.yml"
- 'deepspeed/inference/v2/**'
- 'tests/unit/inference/v2/**'
- '.github/workflows/nv-a6000.yml'
workflow_dispatch:

concurrency:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-accelerate-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- "tests/unit/inference/v2/**"
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-inference.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- "tests/unit/inference/v2/**"
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-lightning-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- "tests/unit/inference/v2/**"
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-megatron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- "tests/unit/inference/v2/**"
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/nv-pre-compile-ops.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ on:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- "tests/unit/inference/v2/**"
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand All @@ -19,7 +19,7 @@ concurrency:
cancel-in-progress: true

jobs:
build-ops:
unit-tests:
runs-on: ubuntu-20.04
container:
image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-latest-cpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- "tests/unit/inference/v2/**"
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-latest-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- "tests/unit/inference/v2/**"
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/nv-transformers-v100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
- 'docs/**'
- 'blogs/**'
- 'deepspeed/inference/v2/**'
- "tests/unit/inference/v2/**"
- 'tests/unit/inference/v2/**'
merge_group:
branches: [ master ]
schedule:
Expand Down
4 changes: 2 additions & 2 deletions accelerator/cpu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@ def random(self):
return torch.random

def set_rng_state(self, new_state, device_index=None):
if device_index == None:
if device_index is None:
return torch.set_rng_state(new_state)
return torch.set_rng_state(new_state, device_index)

Expand Down Expand Up @@ -253,7 +253,7 @@ def on_accelerator(self, tensor):
# create an instance of op builder and return, name specified by class_name
def create_op_builder(self, op_name):
builder_class = self.get_op_builder(op_name)
if builder_class != None:
if builder_class is not None:
return builder_class()
return None

Expand Down
4 changes: 2 additions & 2 deletions accelerator/cuda_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def is_synchronized_device(self):

# Device APIs
def device_name(self, device_index=None):
if device_index == None:
if device_index is None:
return 'cuda'
return 'cuda:{}'.format(device_index)

Expand Down Expand Up @@ -280,7 +280,7 @@ def op_builder_dir(self):
class_dict = None

def _lazy_init_class_dict(self):
if self.class_dict != None:
if self.class_dict is not None:
return
else:
self.class_dict = {}
Expand Down
4 changes: 2 additions & 2 deletions accelerator/mps_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def is_synchronized_device(self):

# Device APIs
def device_name(self, device_index=None):
if device_index == None:
if device_index is None:
return "mps"
return "mps:{}".format(device_index)

Expand Down Expand Up @@ -221,7 +221,7 @@ def op_builder_dir(self):
# create an instance of op builder, specified by class_name
def create_op_builder(self, op_name):
builder_class = self.get_op_builder(op_name)
if builder_class != None:
if builder_class is not None:
return builder_class()
return None

Expand Down
2 changes: 1 addition & 1 deletion accelerator/npu_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def is_synchronized_device(self):

# Device APIs
def device_name(self, device_index=None):
if device_index == None:
if device_index is None:
return 'npu'
return 'npu:{}'.format(device_index)

Expand Down
2 changes: 1 addition & 1 deletion accelerator/real_accelerator.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ def _validate_accelerator(accel_obj):


def is_current_accelerator_supported():
return get_accelerator() in SUPPORTED_ACCELERATOR_LIST
return get_accelerator().device_name() in SUPPORTED_ACCELERATOR_LIST


def get_accelerator():
Expand Down
1 change: 1 addition & 0 deletions blogs/deepspeed-fastgen/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -228,6 +228,7 @@ We currently support the following model architectures in this alpha release of
* [LLaMA](https://huggingface.co/models?other=llama) and [LLaMA-2](https://huggingface.co/models?other=llama-2)
* [Mistral](https://huggingface.co/models?other=mistral)
* [OPT](https://huggingface.co/models?other=opt)
* [Falcon](https://huggingface.co/models?other=falcon)

All current models leverage [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer.

Expand Down
2 changes: 1 addition & 1 deletion deepspeed/inference/quantization/layers.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def __init__(self, config: Dict, pre_quant_layer: nn.Embedding) -> None:
device=pre_quant_layer.weight.device,
dtype=pre_quant_layer.weight.dtype)

assert pre_quant_layer.max_norm == None, 'Not supported'
assert pre_quant_layer.max_norm is None, 'Not supported'
assert pre_quant_layer.norm_type == 2, 'Not supported'
assert pre_quant_layer.scale_grad_by_freq == False, 'Not supported'
assert pre_quant_layer.sparse == False, 'Not supported'
Expand Down
3 changes: 3 additions & 0 deletions deepspeed/inference/v2/engine_factory.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
OPTPolicy,
Llama2Policy,
MistralPolicy,
FalconPolicy,
)
from .model_implementations.inference_policy_base import POLICIES, InferenceV2Policy
from .model_implementations.flat_model_helpers import make_metadata_filename, ModelMetadata
Expand Down Expand Up @@ -104,6 +105,8 @@ def build_hf_engine(path: str,
assert version.parse(transformers.__version__) >= version.parse("4.34.0"), \
f"Mistral requires transformers >= 4.34.0, you have version {transformers.__version__}"
policy = MistralPolicy(model_config, checkpoint_engine=checkpoint_engine)
elif model_config.model_type == "falcon":
policy = FalconPolicy(model_config, checkpoint_engine=checkpoint_engine)
else:
raise ValueError(f"Unsupported model type {model_config.model_type}")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,16 @@ void launch_kv_rotary_kernel(T* kv_cache,
DISPATCH_KV_ROTARY_IMPL(5, 128)
DISPATCH_KV_ROTARY_IMPL(8, 64)
DISPATCH_KV_ROTARY_IMPL(8, 128)
DISPATCH_KV_ROTARY_IMPL(16, 64)
DISPATCH_KV_ROTARY_IMPL(16, 128)
DISPATCH_KV_ROTARY_IMPL(29, 64)
DISPATCH_KV_ROTARY_IMPL(29, 128)
DISPATCH_KV_ROTARY_IMPL(35, 64)
DISPATCH_KV_ROTARY_IMPL(35, 128)
DISPATCH_KV_ROTARY_IMPL(36, 64)
DISPATCH_KV_ROTARY_IMPL(36, 128)
DISPATCH_KV_ROTARY_IMPL(71, 64)
DISPATCH_KV_ROTARY_IMPL(71, 128)
}

#define INSTANTIATE_KV_ROTARY_KERNEL(TYPE) \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class BlockedRotaryEmbeddings(DSKernelBase):

supported_dtypes = [DtypeEnum.fp16, DtypeEnum.bf16]
supported_head_sizes = [64, 128]
supported_q_ratios = [1, 2, 4, 5, 8]
supported_q_ratios = [1, 2, 4, 5, 8, 16, 29, 35, 36, 71]

def __init__(self, head_size: int, n_q_heads: int, n_kv_heads: int, dtype: torch.dtype) -> None:
"""
Expand Down
1 change: 1 addition & 0 deletions deepspeed/inference/v2/model_implementations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@
from .llama_v2 import *
from .opt import *
from .mistral import *
from .falcon import *
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from .falcon_policy import FalconPolicy
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

# Create a container object to save model-specific tensors using the policy file above.

from ...model_implementations.common_parameters import *
from ...model_implementations.layer_container_base import LayerContainer
'''
# HF Falcon 7b model looks like this:
FalconForCausalLM(
(transformer): FalconModel(
(word_embeddings): Embedding(65024, 4544)
(h): ModuleList(
(0-31): 32 x FalconDecoderLayer(
(self_attention): FalconAttention(
(maybe_rotary): FalconRotaryEmbedding()
(query_key_value): FalconLinear(in_features=4544, out_features=4672, bias=False)
(dense): FalconLinear(in_features=4544, out_features=4544, bias=False)
(attention_dropout): Dropout(p=0.0, inplace=False)
)
(mlp): FalconMLP(
(dense_h_to_4h): FalconLinear(in_features=4544, out_features=18176, bias=False)
(act): GELU(approximate='none')
(dense_4h_to_h): FalconLinear(in_features=18176, out_features=4544, bias=False)
)
(input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
)
)
(ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
)
(lm_head): Linear(in_features=4544, out_features=65024, bias=False)
)
'''


class FalconTransformerContainer(LayerContainer):
    """
    Transformer layer container for the Falcon model.

    Declares the weights of one decoder layer and, through ``PARAM_MAPPING``,
    associates each checkpoint parameter name with the container attribute that
    holds it. The key names follow the HF Falcon 7b module layout, where each
    decoder layer has a single ``input_layernorm``.
    """
    # Weight of ``self_attention.query_key_value`` (Q, K and V in one matrix).
    qkv_w: FusedQKVParameter
    # Weight of ``self_attention.dense``.
    attn_out_w: AttentionOutputParameter
    # Weight of ``mlp.dense_h_to_4h``.
    mlp_1_w: MLP1Parameter
    # Weight of ``mlp.dense_4h_to_h``.
    mlp_2_w: MLP2Parameter
    # Weight and bias of ``input_layernorm``.
    ln_attn_gamma: NormParameter
    ln_attn_beta: NormParameter

    # Keys are parameter names relative to one decoder layer; values are
    # "<attribute>.params" targets on this container.
    # NOTE(review): how the mapping is resolved lives in LayerContainer, which
    # is not visible here -- confirm before relying on prefix handling.
    PARAM_MAPPING = {
        "self_attention.query_key_value.weight": "qkv_w.params",
        "self_attention.dense.weight": "attn_out_w.params",
        "mlp.dense_h_to_4h.weight": "mlp_1_w.params",
        "mlp.dense_4h_to_h.weight": "mlp_2_w.params",
        "input_layernorm.weight": "ln_attn_gamma.params",
        "input_layernorm.bias": "ln_attn_beta.params",
    }


class FalconNonTransformerContainer(LayerContainer):
    """
    Non-Transformer layer container for the Falcon model.

    Holds the weights that live outside the decoder-layer stack: the token
    embedding, the final LayerNorm (``ln_f``) and the ``lm_head`` projection.
    """
    # Weight of ``transformer.word_embeddings``.
    word_emb: EmbeddingParameter
    # Weight of ``lm_head``.
    word_unembed: UnembedParameter
    # Weight and bias of ``transformer.ln_f``.
    final_norm_gamma: NormParameter
    final_norm_beta: NormParameter

    # Unlike the per-layer containers, keys here are written from the model
    # root (``transformer.*`` / ``lm_head.*``); values are "<attribute>.params"
    # targets on this container.
    PARAM_MAPPING = {
        "transformer.word_embeddings.weight": "word_emb.params",
        "transformer.ln_f.weight": "final_norm_gamma.params",
        "transformer.ln_f.bias": "final_norm_beta.params",
        "lm_head.weight": "word_unembed.params",
    }


'''
# HF Falcon 40b model looks like this:
FalconForCausalLM(
(transformer): FalconModel(
(word_embeddings): Embedding(65024, 8192)
(h): ModuleList(
(0-59): 60 x FalconDecoderLayer(
(self_attention): FalconAttention(
(maybe_rotary): FalconRotaryEmbedding()
(query_key_value): FalconLinear(in_features=8192, out_features=9216, bias=False)
(dense): FalconLinear(in_features=8192, out_features=8192, bias=False)
(attention_dropout): Dropout(p=0.0, inplace=False)
)
(mlp): FalconMLP(
(dense_h_to_4h): FalconLinear(in_features=8192, out_features=32768, bias=False)
(act): GELU(approximate='none')
(dense_4h_to_h): FalconLinear(in_features=32768, out_features=8192, bias=False)
)
(ln_attn): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)
(ln_mlp): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)
)
)
(ln_f): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)
)
(lm_head): Linear(in_features=8192, out_features=65024, bias=False)
)
'''


class FalconNewArchTransformerContainer(LayerContainer):
    """
    Transformer layer container for the Falcon model (new decoder layout).

    Counterpart of ``FalconTransformerContainer`` for checkpoints whose decoder
    layers carry two LayerNorms, ``ln_attn`` and ``ln_mlp``, instead of a single
    ``input_layernorm`` -- the layout shown in the HF Falcon 40b module dump.
    """
    # Weight of ``self_attention.query_key_value``.
    # NOTE(review): the parameter type differs from the 7b container
    # (GQAMegatronQKVParameter vs FusedQKVParameter); presumably the fused QKV
    # rows are laid out differently for this variant -- confirm in
    # common_parameters.
    qkv_w: GQAMegatronQKVParameter
    # Weight of ``self_attention.dense``.
    attn_out_w: AttentionOutputParameter
    # Weight of ``mlp.dense_h_to_4h``.
    mlp_1_w: MLP1Parameter
    # Weight of ``mlp.dense_4h_to_h``.
    mlp_2_w: MLP2Parameter
    # Weight and bias of ``ln_attn``.
    ln_attn_gamma: NormParameter
    ln_attn_beta: NormParameter
    # Weight and bias of ``ln_mlp``.
    ln_mlp_gamma: NormParameter
    ln_mlp_beta: NormParameter

    # Keys are parameter names relative to one decoder layer; values are
    # "<attribute>.params" targets on this container.
    PARAM_MAPPING = {
        "self_attention.query_key_value.weight": "qkv_w.params",
        "self_attention.dense.weight": "attn_out_w.params",
        "mlp.dense_h_to_4h.weight": "mlp_1_w.params",
        "mlp.dense_4h_to_h.weight": "mlp_2_w.params",
        "ln_attn.weight": "ln_attn_gamma.params",
        "ln_attn.bias": "ln_attn_beta.params",
        "ln_mlp.weight": "ln_mlp_gamma.params",
        "ln_mlp.bias": "ln_mlp_beta.params",
    }
Loading

0 comments on commit 8eb7b73

Please sign in to comment.