Commit 027c49f

Merge branch 'master' into universal_ckpt_sp

samadejacobs authored Dec 14, 2023
2 parents 7d3f5b0 + 3324efd
Showing 19 changed files with 403 additions and 14 deletions.
6 changes: 3 additions & 3 deletions .github/workflows/nv-a6000.yml
@@ -3,9 +3,9 @@ name: nv-a6000
 on:
   pull_request:
     paths:
-      - "deepspeed/inference/v2/**"
-      - "tests/unit/inference/v2/**"
-      - ".github/workflows/nv-a6000.yml"
+      - 'deepspeed/inference/v2/**'
+      - 'tests/unit/inference/v2/**'
+      - '.github/workflows/nv-a6000.yml'
   workflow_dispatch:
 
 concurrency:
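The quote changes in this and the following workflow files are purely stylistic: in YAML, single- and double-quoted scalars parse identically unless the string contains escape sequences, which these glob patterns do not. A quick hedged check (assuming PyYAML is available):

```python
# Both quote styles parse to the same string list, so swapping "..." for '...'
# in the workflow `paths` filters changes no behavior.
import yaml

double = yaml.safe_load('- "tests/unit/inference/v2/**"')
single = yaml.safe_load("- 'tests/unit/inference/v2/**'")
assert double == single == ["tests/unit/inference/v2/**"]
```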
2 changes: 1 addition & 1 deletion .github/workflows/nv-accelerate-v100.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-inference.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-lightning-v100.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-megatron.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
4 changes: 2 additions & 2 deletions .github/workflows/nv-pre-compile-ops.yml
@@ -8,7 +8,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
@@ -19,7 +19,7 @@ concurrency:
   cancel-in-progress: true
 
 jobs:
-  build-ops:
+  unit-tests:
     runs-on: ubuntu-20.04
     container:
       image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-latest-cpu.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-torch-latest-v100.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
2 changes: 1 addition & 1 deletion .github/workflows/nv-transformers-v100.yml
@@ -6,7 +6,7 @@ on:
       - 'docs/**'
       - 'blogs/**'
       - 'deepspeed/inference/v2/**'
-      - "tests/unit/inference/v2/**"
+      - 'tests/unit/inference/v2/**'
   merge_group:
     branches: [ master ]
   schedule:
1 change: 1 addition & 0 deletions blogs/deepspeed-fastgen/README.md
@@ -228,6 +228,7 @@ We currently support the following model architectures in this alpha release of
 * [LLaMA](https://huggingface.co/models?other=llama) and [LLaMA-2](https://huggingface.co/models?other=llama-2)
 * [Mistral](https://huggingface.co/models?other=mistral)
 * [OPT](https://huggingface.co/models?other=opt)
+* [Falcon](https://huggingface.co/models?other=falcon)
 
 All current models leverage [HuggingFace](https://github.com/huggingface) APIs in our backend to provide both the model weights and the model's corresponding tokenizer.
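With Falcon added to the supported list, serving it should follow the same pipeline flow shown elsewhere in this README; a minimal hedged sketch, assuming DeepSpeed-MII as the front end and `tiiuae/falcon-7b` as an illustrative checkpoint:

```python
# Minimal FastGen sketch: Falcon routes through the same non-persistent
# pipeline as the other supported architectures. The checkpoint name is
# illustrative; any HF Falcon checkpoint should take the same path.
import mii

pipe = mii.pipeline("tiiuae/falcon-7b")
response = pipe(["DeepSpeed is"], max_new_tokens=64)
print(response)
```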
3 changes: 3 additions & 0 deletions deepspeed/inference/v2/engine_factory.py
@@ -17,6 +17,7 @@
     OPTPolicy,
     Llama2Policy,
     MistralPolicy,
+    FalconPolicy,
 )
 from .model_implementations.inference_policy_base import POLICIES, InferenceV2Policy
 from .model_implementations.flat_model_helpers import make_metadata_filename, ModelMetadata
@@ -104,6 +105,8 @@ def build_hf_engine(path: str,
         assert version.parse(transformers.__version__) >= version.parse("4.34.0"), \
             f"Mistral requires transformers >= 4.34.0, you have version {transformers.__version__}"
         policy = MistralPolicy(model_config, checkpoint_engine=checkpoint_engine)
+    elif model_config.model_type == "falcon":
+        policy = FalconPolicy(model_config, checkpoint_engine=checkpoint_engine)
     else:
         raise ValueError(f"Unsupported model type {model_config.model_type}")
 
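With the new `elif` branch, a Falcon checkpoint takes the same factory path as the other model types; a hedged usage sketch, assuming `RaggedInferenceEngineConfig` from `config_v2` and an illustrative local checkpoint directory:

```python
# Hedged sketch: build_hf_engine reads the HuggingFace config at `path`,
# matches model_type == "falcon", and constructs the engine via FalconPolicy.
from deepspeed.inference.v2.config_v2 import RaggedInferenceEngineConfig
from deepspeed.inference.v2.engine_factory import build_hf_engine

engine = build_hf_engine(
    path="/models/falcon-7b",  # illustrative local checkpoint path
    engine_config=RaggedInferenceEngineConfig(),
)
```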
@@ -227,6 +227,16 @@ void launch_kv_rotary_kernel(T* kv_cache,
     DISPATCH_KV_ROTARY_IMPL(5, 128)
     DISPATCH_KV_ROTARY_IMPL(8, 64)
     DISPATCH_KV_ROTARY_IMPL(8, 128)
+    DISPATCH_KV_ROTARY_IMPL(16, 64)
+    DISPATCH_KV_ROTARY_IMPL(16, 128)
+    DISPATCH_KV_ROTARY_IMPL(29, 64)
+    DISPATCH_KV_ROTARY_IMPL(29, 128)
+    DISPATCH_KV_ROTARY_IMPL(35, 64)
+    DISPATCH_KV_ROTARY_IMPL(35, 128)
+    DISPATCH_KV_ROTARY_IMPL(36, 64)
+    DISPATCH_KV_ROTARY_IMPL(36, 128)
+    DISPATCH_KV_ROTARY_IMPL(71, 64)
+    DISPATCH_KV_ROTARY_IMPL(71, 128)
 }
 
 #define INSTANTIATE_KV_ROTARY_KERNEL(TYPE) \
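The first argument to `DISPATCH_KV_ROTARY_IMPL` is the query-to-KV-head ratio a kernel instance is specialized for, so the new entries line up with the attention shapes Falcon uses; a hedged illustration with head counts taken from public HF configs (ratios 35 and 36 presumably cover further released variants):

```python
# q_ratio = n_q_heads // n_kv_heads, the first macro argument above.
# Head counts below are from public Hugging Face Falcon configs.
falcon_variants = {
    "falcon-7b": (71, 1),     # multi-query attention   -> q_ratio 71
    "falcon-40b": (128, 8),   # grouped-query attention -> q_ratio 16
    "falcon-180B": (232, 8),  # grouped-query attention -> q_ratio 29
}
for name, (n_q, n_kv) in falcon_variants.items():
    print(f"{name}: q_ratio = {n_q // n_kv}")
```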
@@ -19,7 +19,7 @@ class BlockedRotaryEmbeddings(DSKernelBase):
 
     supported_dtypes = [DtypeEnum.fp16, DtypeEnum.bf16]
     supported_head_sizes = [64, 128]
-    supported_q_ratios = [1, 2, 4, 5, 8]
+    supported_q_ratios = [1, 2, 4, 5, 8, 16, 29, 35, 36, 71]
 
     def __init__(self, head_size: int, n_q_heads: int, n_kv_heads: int, dtype: torch.dtype) -> None:
         """
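Mirroring the kernel's compiled specializations in `supported_q_ratios` lets the Python wrapper fail fast on unsupported shapes; a hedged sketch of the kind of check this enables (the actual constructor logic may differ):

```python
# Hedged sketch: only the (q_ratio, head_size) pairs dispatched in the CUDA
# file above are compiled, so reject other attention shapes up front.
def check_q_ratio(n_q_heads: int, n_kv_heads: int) -> int:
    q_ratio = n_q_heads // n_kv_heads
    if q_ratio not in BlockedRotaryEmbeddings.supported_q_ratios:
        raise ValueError(f"q_ratio {q_ratio} has no compiled kernel specialization")
    return q_ratio
```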
1 change: 1 addition & 0 deletions deepspeed/inference/v2/model_implementations/__init__.py
@@ -12,3 +12,4 @@
 from .llama_v2 import *
 from .opt import *
 from .mistral import *
+from .falcon import *
@@ -0,0 +1,6 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

from .falcon_policy import FalconPolicy
@@ -0,0 +1,129 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

# Create a container object to save model-specific tensors using the policy file above.

from ...model_implementations.common_parameters import *
from ...model_implementations.layer_container_base import LayerContainer
'''
# HF Falcon 7b model looks like this:
FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 4544)
    (h): ModuleList(
      (0-31): 32 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (maybe_rotary): FalconRotaryEmbedding()
          (query_key_value): FalconLinear(in_features=4544, out_features=4672, bias=False)
          (dense): FalconLinear(in_features=4544, out_features=4544, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): FalconLinear(in_features=4544, out_features=18176, bias=False)
          (act): GELU(approximate='none')
          (dense_4h_to_h): FalconLinear(in_features=18176, out_features=4544, bias=False)
        )
        (input_layernorm): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((4544,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=4544, out_features=65024, bias=False)
)
'''


class FalconTransformerContainer(LayerContainer):
    """
    Transformer layer container for the Falcon model.
    """
    qkv_w: FusedQKVParameter
    attn_out_w: AttentionOutputParameter
    mlp_1_w: MLP1Parameter
    mlp_2_w: MLP2Parameter
    ln_attn_gamma: NormParameter
    ln_attn_beta: NormParameter

    PARAM_MAPPING = {
        "self_attention.query_key_value.weight": "qkv_w.params",
        "self_attention.dense.weight": "attn_out_w.params",
        "mlp.dense_h_to_4h.weight": "mlp_1_w.params",
        "mlp.dense_4h_to_h.weight": "mlp_2_w.params",
        "input_layernorm.weight": "ln_attn_gamma.params",
        "input_layernorm.bias": "ln_attn_beta.params",
    }


class FalconNonTransformerContainer(LayerContainer):
    """
    Non-Transformer layer container for the Falcon model.
    """
    word_emb: EmbeddingParameter
    word_unembed: UnembedParameter
    final_norm_gamma: NormParameter
    final_norm_beta: NormParameter

    PARAM_MAPPING = {
        "transformer.word_embeddings.weight": "word_emb.params",
        "transformer.ln_f.weight": "final_norm_gamma.params",
        "transformer.ln_f.bias": "final_norm_beta.params",
        "lm_head.weight": "word_unembed.params",
    }


'''
# HF Falcon 40b model looks like this:
FalconForCausalLM(
  (transformer): FalconModel(
    (word_embeddings): Embedding(65024, 8192)
    (h): ModuleList(
      (0-59): 60 x FalconDecoderLayer(
        (self_attention): FalconAttention(
          (maybe_rotary): FalconRotaryEmbedding()
          (query_key_value): FalconLinear(in_features=8192, out_features=9216, bias=False)
          (dense): FalconLinear(in_features=8192, out_features=8192, bias=False)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (mlp): FalconMLP(
          (dense_h_to_4h): FalconLinear(in_features=8192, out_features=32768, bias=False)
          (act): GELU(approximate='none')
          (dense_4h_to_h): FalconLinear(in_features=32768, out_features=8192, bias=False)
        )
        (ln_attn): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)
        (ln_mlp): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_f): LayerNorm((8192,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=8192, out_features=65024, bias=False)
)
'''


class FalconNewArchTransformerContainer(LayerContainer):
    """
    Transformer layer container for the new-architecture (Falcon 40B-style) model,
    which uses separate attention and MLP layer norms.
    """
    qkv_w: GQAMegatronQKVParameter
    attn_out_w: AttentionOutputParameter
    mlp_1_w: MLP1Parameter
    mlp_2_w: MLP2Parameter
    ln_attn_gamma: NormParameter
    ln_attn_beta: NormParameter
    ln_mlp_gamma: NormParameter
    ln_mlp_beta: NormParameter

    PARAM_MAPPING = {
        "self_attention.query_key_value.weight": "qkv_w.params",
        "self_attention.dense.weight": "attn_out_w.params",
        "mlp.dense_h_to_4h.weight": "mlp_1_w.params",
        "mlp.dense_4h_to_h.weight": "mlp_2_w.params",
        "ln_attn.weight": "ln_attn_gamma.params",
        "ln_attn.bias": "ln_attn_beta.params",
        "ln_mlp.weight": "ln_mlp_gamma.params",
        "ln_mlp.bias": "ln_mlp_beta.params",
    }
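A note on reading `PARAM_MAPPING`: keys are parameter names inside one HF decoder layer (or, for the non-transformer container, full model paths) and values name the container field the tensor is routed into. A small hedged illustration of the lookup direction (the actual routing is performed by the `LayerContainer` machinery, not by this snippet):

```python
# Hedged illustration: each mapping entry routes a HuggingFace checkpoint
# tensor to a typed container parameter when the model is loaded.
mapping = FalconTransformerContainer.PARAM_MAPPING
print(mapping["self_attention.query_key_value.weight"])  # -> "qkv_w.params"
```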