update granite configs
Signed-off-by: Yu Chin Fabian Lim <[email protected]>
fabianlim committed Nov 11, 2024
1 parent 354513a commit 21af5fb
Showing 5 changed files with 178 additions and 1 deletion.
23 changes: 22 additions & 1 deletion sample-configurations/CONTENTS.yaml
@@ -74,7 +74,7 @@ framework_configs:
      - fused-ops-and-kernels
    filename: foak-fast-kernels-sample-configuration.yaml

  # moe configs
  # ------- MOE CONFIGS ----------
  - shortname: moe-scattermoe-granite-ep1
    plugins:
      - accelerated-moe
@@ -86,6 +86,13 @@ framework_configs:
      - attention-and-distributed-packing
    filename: moe-scattermoe-granite-ep1-padding-free-sample-configuration.yaml

  - shortname: moe-scattermoe-granite-ep1-padding-free-foak
    plugins:
      - accelerated-moe
      - attention-and-distributed-packing
      - fused-ops-and-kernels
    filename: moe-scattermoe-granite-ep1-padding-free-foak-sample-configuration.yaml

  - shortname: moe-scattermoe-granite-ep2
    plugins:
      - accelerated-moe
@@ -97,6 +104,13 @@ framework_configs:
      - attention-and-distributed-packing
    filename: moe-scattermoe-granite-ep2-padding-free-sample-configuration.yaml

  - shortname: moe-scattermoe-granite-ep2-padding-free-foak
    plugins:
      - accelerated-moe
      - attention-and-distributed-packing
      - fused-ops-and-kernels
    filename: moe-scattermoe-granite-ep2-padding-free-foak-sample-configuration.yaml

  - shortname: moe-scattermoe-granite-ep4
    plugins:
      - accelerated-moe
@@ -108,6 +122,13 @@ framework_configs:
      - attention-and-distributed-packing
    filename: moe-scattermoe-granite-ep4-padding-free-sample-configuration.yaml

  - shortname: moe-scattermoe-granite-ep4-padding-free-foak
    plugins:
      - accelerated-moe
      - attention-and-distributed-packing
      - fused-ops-and-kernels
    filename: moe-scattermoe-granite-ep4-padding-free-foak-sample-configuration.yaml

  - shortname: moe-scattermoe-granite-ep8
    plugins:
      - accelerated-moe
@@ -0,0 +1,51 @@
# FMS Acceleration Plugin Configuration.
#
# Each stanza incorporates various configurations for
# different fine-tuning / training tasks.
plugins:
  # Configurations to accelerate data packing/padding in training
  training:

    # attention module configurations
    # e.g. padding-free modifications to attention layer
    attention:

      # this controls the configurations for padding-free computation of flash attention
      padding_free:
        method: huggingface

    fused_ops_and_kernels:

      # if placed under the training stanza, putting
      # base_layer and fused_lora here is a misnomer
      # - they belong under peft.quantized
      # However, if they are specified, they will still
      # be read. This is useful in use cases where
      # the yaml is system generated and not shown
      # to a user.

      # activate various unsloth optimizations
      # there are two versions of the plugin
      # - the FastKernel version supports individual kernels
      # - the FastQuantized version is all-or-nothing

      # fast loss triton kernels
      fast_loss: true

      # fast rms norm triton kernels
      fast_rms_layernorm: true

      # fast RoPE embedding triton kernels
      fast_rope_embeddings: true

    moe:

      # expert-parallel for MoE
      scattermoe:

        # The level of expert parallel sharding.
        # - 1 means no sharding
        # - if > 1, please ensure that it divides the world_size: the devices
        #   are replicated for every ep_degree devices, and the experts are
        #   sharded within each group.
        # - if > 1, also ensure that it divides the number of experts, as each
        #   device will then hold num_of_experts / ep_degree experts.
        ep_degree: 1
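As a quick way to see how these settings nest once the file is generated, here is a minimal sketch (not part of this commit) that loads one of the sample configurations with PyYAML and prints the plugin settings it enables; the path is illustrative and the key layout follows the stanza shown above.

```python
# Minimal sketch (not part of this commit): read a generated sample
# configuration and print the plugin settings it enables. The path is
# illustrative; adjust it to the file produced in your checkout.
import yaml

CONFIG_PATH = (
    "sample-configurations/"
    "moe-scattermoe-granite-ep1-padding-free-foak-sample-configuration.yaml"
)

with open(CONFIG_PATH, encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

training = cfg["plugins"]["training"]

# padding-free flash attention (attention-and-distributed-packing)
print(training["attention"]["padding_free"]["method"])      # huggingface

# fused-ops-and-kernels triton kernel switches
foak = training["fused_ops_and_kernels"]
print(foak["fast_loss"], foak["fast_rms_layernorm"], foak["fast_rope_embeddings"])

# accelerated-moe expert parallelism
print(training["moe"]["scattermoe"]["ep_degree"])            # 1
```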
@@ -0,0 +1,51 @@
# FMS Acceleration Plugin Configuration.
#
# Each stanza incorporates various configurations for
# different fine-tuning / training tasks.
plugins:
  # Configurations to accelerate data packing/padding in training
  training:

    # attention module configurations
    # e.g. padding-free modifications to attention layer
    attention:

      # this controls the configurations for padding-free computation of flash attention
      padding_free:
        method: huggingface

    fused_ops_and_kernels:

      # if placed under the training stanza, putting
      # base_layer and fused_lora here is a misnomer
      # - they belong under peft.quantized
      # However, if they are specified, they will still
      # be read. This is useful in use cases where
      # the yaml is system generated and not shown
      # to a user.

      # activate various unsloth optimizations
      # there are two versions of the plugin
      # - the FastKernel version supports individual kernels
      # - the FastQuantized version is all-or-nothing

      # fast loss triton kernels
      fast_loss: true

      # fast rms norm triton kernels
      fast_rms_layernorm: true

      # fast RoPE embedding triton kernels
      fast_rope_embeddings: true

    moe:

      # expert-parallel for MoE
      scattermoe:

        # The level of expert parallel sharding.
        # - 1 means no sharding
        # - if > 1, please ensure that it divides the world_size: the devices
        #   are replicated for every ep_degree devices, and the experts are
        #   sharded within each group.
        # - if > 1, also ensure that it divides the number of experts, as each
        #   device will then hold num_of_experts / ep_degree experts.
        ep_degree: 2
@@ -0,0 +1,51 @@
# FMS Acceleration Plugin Configuration.
#
# Each stanza incorporates various configurations for
# different fine-tuning / training tasks.
plugins:
  # Configurations to accelerate data packing/padding in training
  training:

    # attention module configurations
    # e.g. padding-free modifications to attention layer
    attention:

      # this controls the configurations for padding-free computation of flash attention
      padding_free:
        method: huggingface

    fused_ops_and_kernels:

      # if placed under the training stanza, putting
      # base_layer and fused_lora here is a misnomer
      # - they belong under peft.quantized
      # However, if they are specified, they will still
      # be read. This is useful in use cases where
      # the yaml is system generated and not shown
      # to a user.

      # activate various unsloth optimizations
      # there are two versions of the plugin
      # - the FastKernel version supports individual kernels
      # - the FastQuantized version is all-or-nothing

      # fast loss triton kernels
      fast_loss: true

      # fast rms norm triton kernels
      fast_rms_layernorm: true

      # fast RoPE embedding triton kernels
      fast_rope_embeddings: true

    moe:

      # expert-parallel for MoE
      scattermoe:

        # The level of expert parallel sharding.
        # - 1 means no sharding
        # - if > 1, please ensure that it divides the world_size: the devices
        #   are replicated for every ep_degree devices, and the experts are
        #   sharded within each group.
        # - if > 1, also ensure that it divides the number of experts, as each
        #   device will then hold num_of_experts / ep_degree experts.
        ep_degree: 4
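The ep_degree comments above boil down to two divisibility requirements. The helper below is purely illustrative (it does not exist in the repository) and only restates those checks; the 40-expert, 8-GPU numbers in the usage line are made up for the example.

```python
# Illustrative helper (not part of the repository): restate the two
# divisibility constraints documented in the scattermoe stanzas above.
def check_ep_degree(ep_degree: int, world_size: int, num_experts: int) -> int:
    """Return the number of experts held per device, or raise if the setting is invalid."""
    if ep_degree < 1:
        raise ValueError("ep_degree must be >= 1")
    if world_size % ep_degree != 0:
        # devices are grouped in sets of ep_degree and replicated across groups
        raise ValueError(f"ep_degree={ep_degree} must divide world_size={world_size}")
    if num_experts % ep_degree != 0:
        # experts are sharded within each group of ep_degree devices
        raise ValueError(f"ep_degree={ep_degree} must divide num_experts={num_experts}")
    return num_experts // ep_degree

# e.g. a hypothetical 40-expert MoE on 8 GPUs with ep_degree=4:
# 2 expert-parallel groups of 4 devices, 10 experts per device.
print(check_ep_degree(ep_degree=4, world_size=8, num_experts=40))  # 10
```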
3 changes: 3 additions & 0 deletions scripts/generate_sample_configurations.py
@@ -212,10 +212,13 @@ def read_configuration(path: str) -> Dict:
("foak-fast-kernels", (KEY_FAST_KERNELS,)),
("moe-scattermoe-granite-ep1", (KEY_SCATTERMOE_EP1,)),
("moe-scattermoe-granite-ep1-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP1,)),
("moe-scattermoe-granite-ep1-padding-free-foak", (KEY_AADP_PADDING_FREE, KEY_FAST_KERNELS, KEY_SCATTERMOE_EP1,)),
("moe-scattermoe-granite-ep2", (KEY_SCATTERMOE_EP2,)),
("moe-scattermoe-granite-ep2-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP2,)),
("moe-scattermoe-granite-ep2-padding-free-foak", (KEY_AADP_PADDING_FREE, KEY_FAST_KERNELS, KEY_SCATTERMOE_EP2,)),
("moe-scattermoe-granite-ep4", (KEY_SCATTERMOE_EP4,)),
("moe-scattermoe-granite-ep4-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP4,)),
("moe-scattermoe-granite-ep4-padding-free-foak", (KEY_AADP_PADDING_FREE, KEY_FAST_KERNELS, KEY_SCATTERMOE_EP4,)),
("moe-scattermoe-granite-ep8", (KEY_SCATTERMOE_EP8,)),
("moe-scattermoe-granite-ep8-foak", (KEY_FAST_KERNELS, KEY_SCATTERMOE_EP8,)),
]
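Since every shortname added here must also appear in sample-configurations/CONTENTS.yaml with a matching filename, a throwaway check like the one below (not part of generate_sample_configurations.py; it only assumes the `<shortname>-sample-configuration.yaml` convention visible in the entries above) can catch mismatches such as a transposed filename.

```python
# Throwaway consistency check (not part of generate_sample_configurations.py):
# confirm each CONTENTS.yaml entry follows the naming convention used above.
import yaml

with open("sample-configurations/CONTENTS.yaml", encoding="utf-8") as f:
    contents = yaml.safe_load(f)

for entry in contents["framework_configs"]:
    shortname, filename = entry["shortname"], entry["filename"]
    expected = f"{shortname}-sample-configuration.yaml"
    if filename != expected:
        print(f"mismatch: {shortname} -> {filename} (expected {expected})")
```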