update granite configs
Signed-off-by: Yu Chin Fabian Lim <[email protected]>
fabianlim committed Nov 11, 2024
1 parent 354513a commit 21af5fb
Showing 5 changed files with 178 additions and 1 deletion.
23 changes: 22 additions & 1 deletion sample-configurations/CONTENTS.yaml
@@ -74,7 +74,7 @@ framework_configs:
      - fused-ops-and-kernels
    filename: foak-fast-kernels-sample-configuration.yaml

  # moe configs
  # ------- MOE CONFIGS ----------
  - shortname: moe-scattermoe-granite-ep1
    plugins:
      - accelerated-moe
@@ -86,6 +86,13 @@ framework_configs:
      - attention-and-distributed-packing
    filename: moe-scattermoe-granite-ep1-padding-free-sample-configuration.yaml

  - shortname: moe-scattermoe-granite-ep1-padding-free-foak
    plugins:
      - accelerated-moe
      - attention-and-distributed-packing
      - fused-ops-and-kernels
    filename: moe-scattermoe-granite-ep1-padding-free-foak-sample-configuration.yaml

  - shortname: moe-scattermoe-granite-ep2
    plugins:
      - accelerated-moe
@@ -97,6 +104,13 @@ framework_configs:
      - attention-and-distributed-packing
    filename: moe-scattermoe-granite-ep2-padding-free-sample-configuration.yaml

  - shortname: moe-scattermoe-granite-ep2-padding-free-foak
    plugins:
      - accelerated-moe
      - attention-and-distributed-packing
      - fused-ops-and-kernels
    filename: moe-scattermoe-granite-ep2-padding-free-foak-sample-configuration.yaml

  - shortname: moe-scattermoe-granite-ep4
    plugins:
      - accelerated-moe
@@ -108,6 +122,13 @@ framework_configs:
      - attention-and-distributed-packing
    filename: moe-scattermoe-granite-ep4-padding-free-sample-configuration.yaml

  - shortname: moe-scattermoe-granite-ep4-padding-free-foak
    plugins:
      - accelerated-moe
      - attention-and-distributed-packing
      - fused-ops-and-kernels
    filename: moe-scattermoe-granite-ep4-padding-free-foak-sample-configuration.yaml

  - shortname: moe-scattermoe-granite-ep8
    plugins:
      - accelerated-moe
@@ -0,0 +1,51 @@
# FMS Acceleration Plugin Configuration.
#
# Each stanza incorporates various configurations for
# different fine-tuning / training tasks.
plugins:
  # Configurations to accelerate data packing/padding in training
  training:

    # attention module configurations
    # e.g. padding-free modifications to attention layer
    attention:

      # this controls the configurations for padding-free computation of flash attention
      padding_free:
        method: huggingface

    fused_ops_and_kernels:

      # if placed under the training stanza, putting
      # base_layer and fused_lora here is a misnomer
      # - they belong under peft.quantized
      # However, if they are specified, they will still
      # be read. This is useful in use cases where
      # the yaml is system generated and not shown
      # to a user.

      # activate various unsloth optimizations
      # there are two versions of the plugin
      # - the FastKernel version supports individual kernels
      # - the FastQuantized version is all-or-nothing

      # fast loss triton kernels
      fast_loss: true

      # fast rms norm triton kernels
      fast_rms_layernorm: true

      # fast RoPE embedding triton kernels
      fast_rope_embeddings: true

    moe:

      # expert-parallel for MoE
      scattermoe:

        # The level of expert parallel sharding.
        # - 1 means no sharding
        # - if > 1, please ensure that it divides the world_size: the devices
        #   are replicated for every ep_degree devices, and the experts are
        #   sharded within each group.
        # - if > 1, also ensure that it divides the number of experts, as each
        #   device will then hold num_of_experts / ep_degree experts.
        ep_degree: 1
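As a quick way to see how these settings nest once the file is generated, here is a minimal sketch (not part of this commit) that loads one of the sample configurations with PyYAML and prints the plugin settings it enables; the path is illustrative and the key layout follows the stanza shown above.

```python
# Minimal sketch (not part of this commit): read a generated sample
# configuration and print the plugin settings it enables. The path is
# illustrative; adjust it to the file produced in your checkout.
import yaml

CONFIG_PATH = (
    "sample-configurations/"
    "moe-scattermoe-granite-ep1-padding-free-foak-sample-configuration.yaml"
)

with open(CONFIG_PATH, encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

training = cfg["plugins"]["training"]

# padding-free flash attention (attention-and-distributed-packing)
print(training["attention"]["padding_free"]["method"])      # huggingface

# fused-ops-and-kernels triton kernel switches
foak = training["fused_ops_and_kernels"]
print(foak["fast_loss"], foak["fast_rms_layernorm"], foak["fast_rope_embeddings"])

# accelerated-moe expert parallelism
print(training["moe"]["scattermoe"]["ep_degree"])            # 1
```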
@@ -0,0 +1,51 @@
# FMS Acceleration Plugin Configuration.
#
# Each stanza incorporates various configurations for
# different fine-tuning / training tasks.
plugins:
  # Configurations to accelerate data packing/padding in training
  training:

    # attention module configurations
    # e.g. padding-free modifications to attention layer
    attention:

      # this controls the configurations for padding-free computation of flash attention
      padding_free:
        method: huggingface

    fused_ops_and_kernels:

      # if placed under the training stanza, putting
      # base_layer and fused_lora here is a misnomer
      # - they belong under peft.quantized
      # However, if they are specified, they will still
      # be read. This is useful in use cases where
      # the yaml is system generated and not shown
      # to a user.

      # activate various unsloth optimizations
      # there are two versions of the plugin
      # - the FastKernel version supports individual kernels
      # - the FastQuantized version is all-or-nothing

      # fast loss triton kernels
      fast_loss: true

      # fast rms norm triton kernels
      fast_rms_layernorm: true

      # fast RoPE embedding triton kernels
      fast_rope_embeddings: true

    moe:

      # expert-parallel for MoE
      scattermoe:

        # The level of expert parallel sharding.
        # - 1 means no sharding
        # - if > 1, please ensure that it divides the world_size: the devices
        #   are replicated for every ep_degree devices, and the experts are
        #   sharded within each group.
        # - if > 1, also ensure that it divides the number of experts, as each
        #   device will then hold num_of_experts / ep_degree experts.
        ep_degree: 2
@@ -0,0 +1,51 @@
# FMS Acceleration Plugin Configuration.
#
# Each stanza incorporates various configurations for
# different fine-tuning / training tasks.
plugins:
  # Configurations to accelerate data packing/padding in training
  training:

    # attention module configurations
    # e.g. padding-free modifications to attention layer
    attention:

      # this controls the configurations for padding-free computation of flash attention
      padding_free:
        method: huggingface

    fused_ops_and_kernels:

      # if placed under the training stanza, putting
      # base_layer and fused_lora here is a misnomer
      # - they belong under peft.quantized
      # However, if they are specified, they will still
      # be read. This is useful in use cases where
      # the yaml is system generated and not shown
      # to a user.

      # activate various unsloth optimizations
      # there are two versions of the plugin
      # - the FastKernel version supports individual kernels
      # - the FastQuantized version is all-or-nothing

      # fast loss triton kernels
      fast_loss: true

      # fast rms norm triton kernels
      fast_rms_layernorm: true

      # fast RoPE embedding triton kernels
      fast_rope_embeddings: true

    moe:

      # expert-parallel for MoE
      scattermoe:

        # The level of expert parallel sharding.
        # - 1 means no sharding
        # - if > 1, please ensure that it divides the world_size: the devices
        #   are replicated for every ep_degree devices, and the experts are
        #   sharded within each group.
        # - if > 1, also ensure that it divides the number of experts, as each
        #   device will then hold num_of_experts / ep_degree experts.
        ep_degree: 4
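The ep_degree comments above boil down to two divisibility requirements. The helper below is purely illustrative (it does not exist in the repository) and only restates those checks; the 40-expert, 8-GPU numbers in the usage line are made up for the example.

```python
# Illustrative helper (not part of the repository): restate the two
# divisibility constraints documented in the scattermoe stanzas above.
def check_ep_degree(ep_degree: int, world_size: int, num_experts: int) -> int:
    """Return the number of experts held per device, or raise if the setting is invalid."""
    if ep_degree < 1:
        raise ValueError("ep_degree must be >= 1")
    if world_size % ep_degree != 0:
        # devices are grouped in sets of ep_degree and replicated across groups
        raise ValueError(f"ep_degree={ep_degree} must divide world_size={world_size}")
    if num_experts % ep_degree != 0:
        # experts are sharded within each group of ep_degree devices
        raise ValueError(f"ep_degree={ep_degree} must divide num_experts={num_experts}")
    return num_experts // ep_degree

# e.g. a hypothetical 40-expert MoE on 8 GPUs with ep_degree=4:
# 2 expert-parallel groups of 4 devices, 10 experts per device.
print(check_ep_degree(ep_degree=4, world_size=8, num_experts=40))  # 10
```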
3 changes: 3 additions & 0 deletions scripts/generate_sample_configurations.py
@@ -212,10 +212,13 @@ def read_configuration(path: str) -> Dict:
("foak-fast-kernels", (KEY_FAST_KERNELS,)),
("moe-scattermoe-granite-ep1", (KEY_SCATTERMOE_EP1,)),
("moe-scattermoe-granite-ep1-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP1,)),
("moe-scattermoe-granite-ep1-padding-free-foak", (KEY_AADP_PADDING_FREE, KEY_FAST_KERNELS, KEY_SCATTERMOE_EP1,)),
("moe-scattermoe-granite-ep2", (KEY_SCATTERMOE_EP2,)),
("moe-scattermoe-granite-ep2-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP2,)),
("moe-scattermoe-granite-ep2-padding-free-foak", (KEY_AADP_PADDING_FREE, KEY_FAST_KERNELS, KEY_SCATTERMOE_EP2,)),
("moe-scattermoe-granite-ep4", (KEY_SCATTERMOE_EP4,)),
("moe-scattermoe-granite-ep4-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP4,)),
("moe-scattermoe-granite-ep4-padding-free-foak", (KEY_AADP_PADDING_FREE, KEY_FAST_KERNELS, KEY_SCATTERMOE_EP4,)),
("moe-scattermoe-granite-ep8", (KEY_SCATTERMOE_EP8,)),
("moe-scattermoe-granite-ep8-foak", (KEY_FAST_KERNELS, KEY_SCATTERMOE_EP8,)),
]
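Since every shortname added here must also appear in sample-configurations/CONTENTS.yaml with a matching filename, a throwaway check like the one below (not part of generate_sample_configurations.py; it only assumes the `<shortname>-sample-configuration.yaml` convention visible in the entries above) can catch mismatches such as a transposed filename.

```python
# Throwaway consistency check (not part of generate_sample_configurations.py):
# confirm each CONTENTS.yaml entry follows the naming convention used above.
import yaml

with open("sample-configurations/CONTENTS.yaml", encoding="utf-8") as f:
    contents = yaml.safe_load(f)

for entry in contents["framework_configs"]:
    shortname, filename = entry["shortname"], entry["filename"]
    expected = f"{shortname}-sample-configuration.yaml"
    if filename != expected:
        print(f"mismatch: {shortname} -> {filename} (expected {expected})")
```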