From 21af5fb9f2989b3dbf443c016e4c0470b536a593 Mon Sep 17 00:00:00 2001
From: Yu Chin Fabian Lim
Date: Mon, 11 Nov 2024 13:47:39 +0000
Subject: [PATCH] update granite configs

Signed-off-by: Yu Chin Fabian Lim
---
 sample-configurations/CONTENTS.yaml           | 23 ++++++++-
 ...adding-free-foak-sample-configuration.yaml | 51 +++++++++++++++++++
 ...adding-free-foak-sample-configuration.yaml | 51 +++++++++++++++++++
 ...adding-free-foak-sample-configuration.yaml | 51 +++++++++++++++++++
 scripts/generate_sample_configurations.py     |  3 ++
 5 files changed, 178 insertions(+), 1 deletion(-)
 create mode 100644 sample-configurations/moe-scattermoe-granite-ep1-padding-free-foak-sample-configuration.yaml
 create mode 100644 sample-configurations/moe-scattermoe-granite-ep2-padding-free-foak-sample-configuration.yaml
 create mode 100644 sample-configurations/moe-scattermoe-granite-ep4-padding-free-foak-sample-configuration.yaml

diff --git a/sample-configurations/CONTENTS.yaml b/sample-configurations/CONTENTS.yaml
index 41bfb01e..dd393633 100644
--- a/sample-configurations/CONTENTS.yaml
+++ b/sample-configurations/CONTENTS.yaml
@@ -74,7 +74,7 @@ framework_configs:
       - fused-ops-and-kernels
     filename: foak-fast-kernels-sample-configuration.yaml
 
-  # moe configs
+  # ------- MOE CONFIGS ----------
   - shortname: moe-scattermoe-granite-ep1
     plugins:
       - accelerated-moe
@@ -86,6 +86,13 @@ framework_configs:
       - attention-and-distributed-packing
     filename: moe-scattermoe-granite-ep1-padding-free-sample-configuration.yaml
 
+  - shortname: moe-scattermoe-granite-ep1-padding-free-foak
+    plugins:
+      - accelerated-moe
+      - attention-and-distributed-packing
+      - fused-ops-and-kernels
+    filename: moe-scattermoe-granite-ep1-padding-free-foak-sample-configuration.yaml
+
   - shortname: moe-scattermoe-granite-ep2
     plugins:
       - accelerated-moe
@@ -97,6 +104,13 @@ framework_configs:
       - attention-and-distributed-packing
     filename: moe-scattermoe-granite-ep2-padding-free-sample-configuration.yaml
 
+  - shortname: moe-scattermoe-granite-ep2-padding-free-foak
+    plugins:
+      - accelerated-moe
+      - attention-and-distributed-packing
+      - fused-ops-and-kernels
+    filename: moe-scattermoe-granite-ep2-padding-free-foak-sample-configuration.yaml
+
   - shortname: moe-scattermoe-granite-ep4
     plugins:
       - accelerated-moe
@@ -108,6 +122,13 @@ framework_configs:
       - attention-and-distributed-packing
     filename: moe-scattermoe-granite-ep4-padding-free-sample-configuration.yaml
 
+  - shortname: moe-scattermoe-granite-ep4-padding-free-foak
+    plugins:
+      - accelerated-moe
+      - attention-and-distributed-packing
+      - fused-ops-and-kernels
+    filename: moe-scattermoe-granite-ep4-padding-free-foak-sample-configuration.yaml
+
   - shortname: moe-scattermoe-granite-ep8
     plugins:
       - accelerated-moe
diff --git a/sample-configurations/moe-scattermoe-granite-ep1-padding-free-foak-sample-configuration.yaml b/sample-configurations/moe-scattermoe-granite-ep1-padding-free-foak-sample-configuration.yaml
new file mode 100644
index 00000000..881ef14b
--- /dev/null
+++ b/sample-configurations/moe-scattermoe-granite-ep1-padding-free-foak-sample-configuration.yaml
@@ -0,0 +1,51 @@
+# FMS Acceleration Plugin Configuration.
+#
+# Each stanza incorporates various configurations for
+# different fine-tuning / training tasks.
+plugins:
+  # Configurations to accelerate data packing/padding in training
+  training:
+
+    # attention module configurations
+    # e.g. padding-free modifications to attention layer
+    attention:
+
+      # this controls the configurations for padding free computation of flash attention
+      padding_free:
+        method: huggingface
+    fused_ops_and_kernels:
+
+      # if under training stanza, then putting
+      # base_layer and fused_lora will be a misnomer
+      # - this should be in peft.quantized
+      # However, if it is specified, it will still
+      # be read. This is useful in use cases where
+      # the yaml is system generated and not shown
+      # to a user.
+
+      # activate various unsloth optimizations
+      # there are two versions of the plugin
+      # - the FastKernel version supports individual kernels
+      # - the FastQuantized version is all-or-nothing
+
+      # fast loss triton kernels
+      fast_loss: true
+
+      # fast rms norm triton kernels
+      fast_rms_layernorm: true
+
+      # fast RoPE embedding triton kernels
+      fast_rope_embeddings: true
+    moe:
+
+      # expert-parallel for MoE
+      scattermoe:
+
+        # The level of expert parallel sharding.
+        # - 1 means no sharding
+        # - if > 1, please ensure that this divides the world_size. This is because
+        #   the devices will be replicated for every ep_degree devices, and
+        #   the experts will be sharded within each group.
+        # - if > 1, also ensure that it divides the number of experts, as each device
+        #   will then have num_of_experts / ep_degree experts.
+        ep_degree: 1
diff --git a/sample-configurations/moe-scattermoe-granite-ep2-padding-free-foak-sample-configuration.yaml b/sample-configurations/moe-scattermoe-granite-ep2-padding-free-foak-sample-configuration.yaml
new file mode 100644
index 00000000..b3c7712d
--- /dev/null
+++ b/sample-configurations/moe-scattermoe-granite-ep2-padding-free-foak-sample-configuration.yaml
@@ -0,0 +1,51 @@
+# FMS Acceleration Plugin Configuration.
+#
+# Each stanza incorporates various configurations for
+# different fine-tuning / training tasks.
+plugins:
+  # Configurations to accelerate data packing/padding in training
+  training:
+
+    # attention module configurations
+    # e.g. padding-free modifications to attention layer
+    attention:
+
+      # this controls the configurations for padding free computation of flash attention
+      padding_free:
+        method: huggingface
+    fused_ops_and_kernels:
+
+      # if under training stanza, then putting
+      # base_layer and fused_lora will be a misnomer
+      # - this should be in peft.quantized
+      # However, if it is specified, it will still
+      # be read. This is useful in use cases where
+      # the yaml is system generated and not shown
+      # to a user.
+
+      # activate various unsloth optimizations
+      # there are two versions of the plugin
+      # - the FastKernel version supports individual kernels
+      # - the FastQuantized version is all-or-nothing
+
+      # fast loss triton kernels
+      fast_loss: true
+
+      # fast rms norm triton kernels
+      fast_rms_layernorm: true
+
+      # fast RoPE embedding triton kernels
+      fast_rope_embeddings: true
+    moe:
+
+      # expert-parallel for MoE
+      scattermoe:
+
+        # The level of expert parallel sharding.
+        # - 1 means no sharding
+        # - if > 1, please ensure that this divides the world_size. This is because
+        #   the devices will be replicated for every ep_degree devices, and
+        #   the experts will be sharded within each group.
+        # - if > 1, also ensure that it divides the number of experts, as each device
+        #   will then have num_of_experts / ep_degree experts.
+        ep_degree: 2
diff --git a/sample-configurations/moe-scattermoe-granite-ep4-padding-free-foak-sample-configuration.yaml b/sample-configurations/moe-scattermoe-granite-ep4-padding-free-foak-sample-configuration.yaml
new file mode 100644
index 00000000..c73917ce
--- /dev/null
+++ b/sample-configurations/moe-scattermoe-granite-ep4-padding-free-foak-sample-configuration.yaml
@@ -0,0 +1,51 @@
+# FMS Acceleration Plugin Configuration.
+#
+# Each stanza incorporates various configurations for
+# different fine-tuning / training tasks.
+plugins:
+  # Configurations to accelerate data packing/padding in training
+  training:
+
+    # attention module configurations
+    # e.g. padding-free modifications to attention layer
+    attention:
+
+      # this controls the configurations for padding free computation of flash attention
+      padding_free:
+        method: huggingface
+    fused_ops_and_kernels:
+
+      # if under training stanza, then putting
+      # base_layer and fused_lora will be a misnomer
+      # - this should be in peft.quantized
+      # However, if it is specified, it will still
+      # be read. This is useful in use cases where
+      # the yaml is system generated and not shown
+      # to a user.
+
+      # activate various unsloth optimizations
+      # there are two versions of the plugin
+      # - the FastKernel version supports individual kernels
+      # - the FastQuantized version is all-or-nothing
+
+      # fast loss triton kernels
+      fast_loss: true
+
+      # fast rms norm triton kernels
+      fast_rms_layernorm: true
+
+      # fast RoPE embedding triton kernels
+      fast_rope_embeddings: true
+    moe:
+
+      # expert-parallel for MoE
+      scattermoe:
+
+        # The level of expert parallel sharding.
+        # - 1 means no sharding
+        # - if > 1, please ensure that this divides the world_size. This is because
+        #   the devices will be replicated for every ep_degree devices, and
+        #   the experts will be sharded within each group.
+        # - if > 1, also ensure that it divides the number of experts, as each device
+        #   will then have num_of_experts / ep_degree experts.
+        ep_degree: 4
diff --git a/scripts/generate_sample_configurations.py b/scripts/generate_sample_configurations.py
index e4818450..ff775c8e 100644
--- a/scripts/generate_sample_configurations.py
+++ b/scripts/generate_sample_configurations.py
@@ -212,10 +212,13 @@ def read_configuration(path: str) -> Dict:
     ("foak-fast-kernels", (KEY_FAST_KERNELS,)),
     ("moe-scattermoe-granite-ep1", (KEY_SCATTERMOE_EP1,)),
     ("moe-scattermoe-granite-ep1-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP1,)),
+    ("moe-scattermoe-granite-ep1-padding-free-foak", (KEY_AADP_PADDING_FREE, KEY_FAST_KERNELS, KEY_SCATTERMOE_EP1,)),
     ("moe-scattermoe-granite-ep2", (KEY_SCATTERMOE_EP2,)),
     ("moe-scattermoe-granite-ep2-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP2,)),
+    ("moe-scattermoe-granite-ep2-padding-free-foak", (KEY_AADP_PADDING_FREE, KEY_FAST_KERNELS, KEY_SCATTERMOE_EP2,)),
     ("moe-scattermoe-granite-ep4", (KEY_SCATTERMOE_EP4,)),
     ("moe-scattermoe-granite-ep4-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP4,)),
+    ("moe-scattermoe-granite-ep4-padding-free-foak", (KEY_AADP_PADDING_FREE, KEY_FAST_KERNELS, KEY_SCATTERMOE_EP4,)),
     ("moe-scattermoe-granite-ep8", (KEY_SCATTERMOE_EP8,)),
     ("moe-scattermoe-granite-ep8-foak", (KEY_FAST_KERNELS, KEY_SCATTERMOE_EP8,)),
 ]
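The scattermoe comments in the configurations above constrain ep_degree: it must divide both the world size and the model's number of experts. Below is a minimal sketch (not part of the patch) of how that check could be written against one of the sample files added here; the world_size and num_experts values are hypothetical placeholders, not values taken from the patch.

# Sketch: validate the ep_degree divisibility constraints described in the
# scattermoe comments of the sample configurations above.
import yaml  # PyYAML

# One of the files added by this patch.
CONFIG = "sample-configurations/moe-scattermoe-granite-ep2-padding-free-foak-sample-configuration.yaml"

with open(CONFIG, encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

ep_degree = cfg["plugins"]["training"]["moe"]["scattermoe"]["ep_degree"]

world_size = 8   # hypothetical: total number of training devices
num_experts = 8  # hypothetical: number of experts in the MoE model

# ep_degree == 1 means no sharding; larger values must divide the world size
# (devices are grouped every ep_degree devices) and the number of experts
# (each device then holds num_experts / ep_degree experts).
assert world_size % ep_degree == 0, "ep_degree must divide world_size"
assert num_experts % ep_degree == 0, "ep_degree must divide the number of experts"
print(f"ep_degree={ep_degree}: each device holds {num_experts // ep_degree} experts")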