Commit 027c49f

Merge branch 'master' into universal_ckpt_sp

samadejacobs authored Dec 14, 2023
2 parents 7d3f5b0 + 3324efd
Showing 19 changed files with 403 additions and 14 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/nv-a6000.yml
@@ -3,9 +3,9 @@ name: nv-a6000
 on:
   pull_request:
     paths:
-      - "deepspeed/inference/v2/**"
-      - "tests/unit/inference/v2/**"
-      - ".github/workflows/nv-a6000.yml"
+      - 'deepspeed/inference/v2/**'
+      - 'tests/unit/inference/v2/**'
+      - '.github/workflows/nv-a6000.yml'
   workflow_dispatch:
 
 concurrency:
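The quote changes in this and the following workflow files are purely stylistic: in YAML, single- and double-quoted scalars parse identically unless the string contains escape sequences, which these glob patterns do not. A quick hedged check (assuming PyYAML is available):

```python
# Both quote styles parse to the same string list, so swapping "..." for '...'
# in the workflow `paths` filters changes no behavior.
import yaml

double = yaml.safe_load('- "tests/unit/inference/v2/**"')
single = yaml.safe_load("- 'tests/unit/inference/v2/**'")
assert double == single == ["tests/unit/inference/v2/**"]
```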
2 changes: 1 addition & 1 deletion .github/workflows/nv-accelerate-v100.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-inference.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-lightning-v100.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-megatron.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
4 changes: 2 additions & 2 deletions .github/workflows/nv-pre-compile-ops.yml
@@ -8,7 +8,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
@@ -19,7 +19,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  build-ops:
+  unit-tests:
     runs-on: ubuntu-20.04
     container:
       image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-latest-cpu.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-latest-v100.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-transformers-v100.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
1 change: 1 addition & 0 deletions blogs/deepspeed-fastgen/README.md
@@ -228,6 +228,7 @@ We currently support the following model architectures in this alpha release of
 * [LLaMA](https://huggingface.co/models?other=llama) and [LLaMA-2](https://huggingface.co/models?other=llama-2)
 * [Mistral](https://huggingface.co/models?other=mistral)
 * [OPT](https://huggingface.co/models?other=opt)
+* [Falcon](https://huggingface.co/models?other=falcon)
 
 All current models leverage [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer.
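With Falcon added to the supported list, serving it should follow the same pipeline flow shown elsewhere in this README; a minimal hedged sketch, assuming DeepSpeed-MII as the front end and `tiiuae/falcon-7b` as an illustrative checkpoint:

```python
# Minimal FastGen sketch: Falcon routes through the same non-persistent
# pipeline as the other supported architectures. The checkpoint name is
# illustrative; any HF Falcon checkpoint should take the same path.
import mii

pipe = mii.pipeline("tiiuae/falcon-7b")
response = pipe(["DeepSpeed is"], max_new_tokens=64)
print(response)
```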
3 changes: 3 additions & 0 deletions deepspeed/inference/v2/engine_factory.py
@@ -17,6 +17,7 @@
     OPTPolicy,
     Llama2Policy,
     MistralPolicy,
+    FalconPolicy,
 )
 from .model_implementations.inference_policy_base import POLICIES, InferenceV2Policy
 from .model_implementations.flat_model_helpers import make_metadata_filename, ModelMetadata
@@ -104,6 +105,8 @@ def build_hf_engine(path: str,
         assert version.parse(transformers.__version__) >= version.parse("4.34.0"), \
             f"Mistral requires transformers >= 4.34.0, you have version {transformers.__version__}"
         policy = MistralPolicy(model_config, checkpoint_engine=checkpoint_engine)
+    elif model_config.model_type == "falcon":
+        policy = FalconPolicy(model_config, checkpoint_engine=checkpoint_engine)
     else:
         raise ValueError(f"Unsupported model type {model_config.model_type}")
 
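With the new `elif` branch, a Falcon checkpoint takes the same factory path as the other model types; a hedged usage sketch, assuming `RaggedInferenceEngineConfig` from `config_v2` and an illustrative local checkpoint directory:

```python
# Hedged sketch: build_hf_engine reads the HuggingFace config at `path`,
# matches model_type == "falcon", and constructs the engine via FalconPolicy.
from deepspeed.inference.v2.config_v2 import RaggedInferenceEngineConfig
from deepspeed.inference.v2.engine_factory import build_hf_engine

engine = build_hf_engine(
    path="/models/falcon-7b",  # illustrative local checkpoint path
    engine_config=RaggedInferenceEngineConfig(),
)
```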
@@ -227,6 +227,16 @@ void launch_kv_rotary_kernel(T* kv_cache,
     DISPATCH_KV_ROTARY_IMPL(5, 128)
     DISPATCH_KV_ROTARY_IMPL(8, 64)
     DISPATCH_KV_ROTARY_IMPL(8, 128)
+    DISPATCH_KV_ROTARY_IMPL(16, 64)
+    DISPATCH_KV_ROTARY_IMPL(16, 128)
+    DISPATCH_KV_ROTARY_IMPL(29, 64)
+    DISPATCH_KV_ROTARY_IMPL(29, 128)
+    DISPATCH_KV_ROTARY_IMPL(35, 64)
+    DISPATCH_KV_ROTARY_IMPL(35, 128)
+    DISPATCH_KV_ROTARY_IMPL(36, 64)
+    DISPATCH_KV_ROTARY_IMPL(36, 128)
+    DISPATCH_KV_ROTARY_IMPL(71, 64)
+    DISPATCH_KV_ROTARY_IMPL(71, 128)
 }
 
 #define INSTANTIATE_KV_ROTARY_KERNEL(TYPE) \
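The first argument to `DISPATCH_KV_ROTARY_IMPL` is the query-to-KV-head ratio a kernel instance is specialized for, so the new entries line up with the attention shapes Falcon uses; a hedged illustration with head counts taken from public HF configs (ratios 35 and 36 presumably cover further released variants):

```python
# q_ratio = n_q_heads // n_kv_heads, the first macro argument above.
# Head counts below are from public Hugging Face Falcon configs.
falcon_variants = {
    "falcon-7b": (71, 1),     # multi-query attention   -> q_ratio 71
    "falcon-40b": (128, 8),   # grouped-query attention -> q_ratio 16
    "falcon-180B": (232, 8),  # grouped-query attention -> q_ratio 29
}
for name, (n_q, n_kv) in falcon_variants.items():
    print(f"{name}: q_ratio = {n_q // n_kv}")
```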
@@ -19,7 +19,7 @@ class BlockedRotaryEmbeddings(DSKernelBase):
 
     supported_dtypes = [DtypeEnum.fp16, DtypeEnum.bf16]
     supported_head_sizes = [64, 128]
-    supported_q_ratios = [1, 2, 4, 5, 8]
+    supported_q_ratios = [1, 2, 4, 5, 8, 16, 29, 35, 36, 71]
 
     def __init__(self, head_size: int, n_q_heads: int, n_kv_heads: int, dtype: torch.dtype) -> None:
         """
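Mirroring the kernel's compiled specializations in `supported_q_ratios` lets the Python wrapper fail fast on unsupported shapes; a hedged sketch of the kind of check this enables (the actual constructor logic may differ):

```python
# Hedged sketch: only the (q_ratio, head_size) pairs dispatched in the CUDA
# file above are compiled, so reject other attention shapes up front.
def check_q_ratio(n_q_heads: int, n_kv_heads: int) -> int:
    q_ratio = n_q_heads // n_kv_heads
    if q_ratio not in BlockedRotaryEmbeddings.supported_q_ratios:
        raise ValueError(f"q_ratio {q_ratio} has no compiled kernel specialization")
    return q_ratio
```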
1 change: 1 addition & 0 deletions deepspeed/inference/v2/model_implementations/__init__.py
@@ -12,3 +12,4 @@
 from .llama_v2 import *
 from .opt import *
 from .mistral import *
+from .falcon import *
@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from .falcon_policy import FalconPolicy
@@ -0,0 +1,129 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

# Create a container object to save model-specific tensors using the policy file above.

from ...model_implementations.common_parameters import *
from ...model_implementations.layer_container_base import LayerContainer
'''
# HF Falcon 7b model looks like this:
FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (maybe_rotary): FalconRotaryEmbedding()
          (query_key_value): FalconLinear(in_features=4544, out_features=4672, bias=False)
          (dense): FalconLinear(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): FalconLinear(in_features=4544, out_features=18176, bias=False)
          (act): GELU(approximate='none')
          (dense_4h_to_h): FalconLinear(in_features=18176, out_features=4544, bias=False)
        )
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=4544, out_features=65024, bias=False)
)
'''


class FalconTransformerContainer(LayerContainer):
    """
    Transformer layer container for the Falcon model.
    """
    qkv_w: FusedQKVParameter
    attn_out_w: AttentionOutputParameter
    mlp_1_w: MLP1Parameter
    mlp_2_w: MLP2Parameter
    ln_attn_gamma: NormParameter
    ln_attn_beta: NormParameter

    PARAM_MAPPING = {
        "self_attention.query_key_value.weight": "qkv_w.params",
        "self_attention.dense.weight": "attn_out_w.params",
        "mlp.dense_h_to_4h.weight": "mlp_1_w.params",
        "mlp.dense_4h_to_h.weight": "mlp_2_w.params",
        "input_layernorm.weight": "ln_attn_gamma.params",
        "input_layernorm.bias": "ln_attn_beta.params",
    }


class FalconNonTransformerContainer(LayerContainer):
    """
    Non-Transformer layer container for the Falcon model.
    """
    word_emb: EmbeddingParameter
    word_unembed: UnembedParameter
    final_norm_gamma: NormParameter
    final_norm_beta: NormParameter

    PARAM_MAPPING = {
        "transformer.word_embeddings.weight": "word_emb.params",
        "transformer.ln_f.weight": "final_norm_gamma.params",
        "transformer.ln_f.bias": "final_norm_beta.params",
        "lm_head.weight": "word_unembed.params",
    }


'''
# HF Falcon 40b model looks like this:
FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 8192)
    (h): ModuleList(
      (0-59): 60 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (maybe_rotary): FalconRotaryEmbedding()
          (query_key_value): FalconLinear(in_features=8192, out_features=9216, bias=False)
          (dense): FalconLinear(in_features=8192, out_features=8192, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): FalconLinear(in_features=8192, out_features=32768, bias=False)
          (act): GELU(approximate='none')
          (dense_4h_to_h): FalconLinear(in_features=32768, out_features=8192, bias=False)
        )
        (ln_attn): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)
        (ln_mlp): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=8192, out_features=65024, bias=False)
)
'''


class FalconNewArchTransformerContainer(LayerContainer):
    """
    Transformer layer container for the new-architecture (Falcon 40B-style) model,
    which uses separate attention and MLP layer norms.
    """
    qkv_w: GQAMegatronQKVParameter
    attn_out_w: AttentionOutputParameter
    mlp_1_w: MLP1Parameter
    mlp_2_w: MLP2Parameter
    ln_attn_gamma: NormParameter
    ln_attn_beta: NormParameter
    ln_mlp_gamma: NormParameter
    ln_mlp_beta: NormParameter

    PARAM_MAPPING = {
        "self_attention.query_key_value.weight": "qkv_w.params",
        "self_attention.dense.weight": "attn_out_w.params",
        "mlp.dense_h_to_4h.weight": "mlp_1_w.params",
        "mlp.dense_4h_to_h.weight": "mlp_2_w.params",
        "ln_attn.weight": "ln_attn_gamma.params",
        "ln_attn.bias": "ln_attn_beta.params",
        "ln_mlp.weight": "ln_mlp_gamma.params",
        "ln_mlp.bias": "ln_mlp_beta.params",
    }
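A note on reading `PARAM_MAPPING`: keys are parameter names inside one HF decoder layer (or, for the non-transformer container, full model paths) and values name the container field the tensor is routed into. A small hedged illustration of the lookup direction (the actual routing is performed by the `LayerContainer` machinery, not by this snippet):

```python
# Hedged illustration: each mapping entry routes a HuggingFace checkpoint
# tensor to a typed container parameter when the model is loaded.
mapping = FalconTransformerContainer.PARAM_MAPPING
print(mapping["self_attention.query_key_value.weight"])  # -> "qkv_w.params"
```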