diff --git a/plugins/accelerated-moe/README.md b/plugins/accelerated-moe/README.md
index 1d6af169..cb8e8d8e 100644
--- a/plugins/accelerated-moe/README.md
+++ b/plugins/accelerated-moe/README.md
@@ -44,10 +44,12 @@ Notes on code extraction:
 Run the below in the top-level directory of this repo:
 - the `scattermoe` dep is not included by default, so the `-x` switch installs it.
+- consider disabling `torch` memory logging to see improved speeds (the `MEMORY_LOGGING=nvidia` setting below does this).
 
 ```
 tox -e run-benches \
     -x testenv:run-benches.deps+="-r plugins/accelerated-moe/requirements-khd.txt" \
+    -x testenv:run-benches.setenv+="MEMORY_LOGGING=nvidia" \
     -- \
     "1 2 4" 128 benchmark_outputs scenarios-moe.yaml accelerated-moe-scatter
 ```
 
diff --git a/sample-configurations/CONTENTS.yaml b/sample-configurations/CONTENTS.yaml
index 24bd6fcd..41bfb01e 100644
--- a/sample-configurations/CONTENTS.yaml
+++ b/sample-configurations/CONTENTS.yaml
@@ -113,8 +113,8 @@ framework_configs:
       - accelerated-moe
     filename: moe-scattermoe-granite-ep8-sample-configuration.yaml
 
-  - shortname: moe-scattermoe-granite-ep8-padding-free
+  - shortname: moe-scattermoe-granite-ep8-foak
     plugins:
       - accelerated-moe
-      - attention-and-distributed-packing
-    filename: moe-scattermoe-granite-ep8-padding-free-sample-configuration.yaml
\ No newline at end of file
+      - fused-ops-and-kernels
+    filename: moe-scattermoe-granite-ep8-foak-sample-configuration.yaml
\ No newline at end of file
diff --git a/sample-configurations/moe-scattermoe-granite-ep8-foak-sample-configuration.yaml b/sample-configurations/moe-scattermoe-granite-ep8-foak-sample-configuration.yaml
new file mode 100644
index 00000000..938c9024
--- /dev/null
+++ b/sample-configurations/moe-scattermoe-granite-ep8-foak-sample-configuration.yaml
@@ -0,0 +1,43 @@
+# FMS Acceleration Plugin Configuration.
+#
+# Each stanza incorporates various configurations for
+# different fine-tuning / training tasks.
+plugins:
+  training:
+
+    fused_ops_and_kernels:
+
+      # if under training stanza, then putting
+      # base_layer and fused_lora will be a misnomer
+      # - this should be in peft.quantized
+      # However, if it is specified, it will still
+      # be read. This is useful in use cases where
+      # the yaml is system generated and not shown
+      # to a user.
+
+      # activate various unsloth optimizations
+      # there are two versions of the plugin
+      # - the FastKernel version supports individual kernels
+      # - the FastQuantized version is all-or-nothing
+
+      # fast loss triton kernels
+      fast_loss: true
+
+      # fast rms norm triton kernels
+      fast_rms_layernorm: true
+
+      # fast RoPE embedding triton kernels
+      fast_rope_embeddings: true
+    moe:
+
+      # expert-parallel for MoE
+      scattermoe:
+
+        # The level of expert parallel sharding.
+        # - 1 means no sharding
+        # - if > 1, please ensure that this divides the world_size. This is because
+        #   the devices will be replicated for every ep_degree devices, and
+        #   the experts will be sharded within each group.
+        # - if > 1, also ensure that it divides the number of experts, as each device
+        #   will then have num_of_experts / ep_degree experts.
+        ep_degree: 8
diff --git a/sample-configurations/moe-scattermoe-granite-ep8-padding-free-sample-configuration.yaml b/sample-configurations/moe-scattermoe-granite-ep8-padding-free-sample-configuration.yaml
deleted file mode 100644
index a29269fd..00000000
--- a/sample-configurations/moe-scattermoe-granite-ep8-padding-free-sample-configuration.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-# FMS Acceleration Plugin Configuration.
-#
-# Each stanza incorporates various configurations for
-# different fine-tuning / training tasks.
-plugins:
-  # Configurations to accelerate data packing/padding in training
-  training:
-
-    # attention module configurations
-    # e.g. padding-free modifications to attention layer
-    attention:
-
-      # this controls the confgurations for padding free computation of flash attention
-      padding_free:
-        method: huggingface
-    moe:
-
-      # expert-parallel for MoE
-      scattermoe:
-
-        # The level of expert parallel sharding.
-        # - 1 means no sharding
-        # - if > 1, please ensure that this divides the world_size. This is because
-        #   the devices will be replicated for every ep_degree devices, and
-        #   the experts will be sharded within each group.
-        # - if > 1, also ensure that it divides the number of experts, as each device
-        #   will then have num_of_experts / ep_degree experts.
-        ep_degree: 8
diff --git a/scripts/benchmarks/scenarios-moe.yaml b/scripts/benchmarks/scenarios-moe.yaml
index 3dbb996a..a1fd4c48 100644
--- a/scripts/benchmarks/scenarios-moe.yaml
+++ b/scripts/benchmarks/scenarios-moe.yaml
@@ -60,6 +60,7 @@ scenarios:
     framework_config:
       - # without acceleration
      - moe-scattermoe-granite-ep8
+      - moe-scattermoe-granite-ep8-foak
     slow: True
     arguments:
       learning_rate: 5e-5
diff --git a/scripts/generate_sample_configurations.py b/scripts/generate_sample_configurations.py
index a30351a3..e4818450 100644
--- a/scripts/generate_sample_configurations.py
+++ b/scripts/generate_sample_configurations.py
@@ -217,7 +217,7 @@ def read_configuration(path: str) -> Dict:
     ("moe-scattermoe-granite-ep4", (KEY_SCATTERMOE_EP4,)),
     ("moe-scattermoe-granite-ep4-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP4,)),
     ("moe-scattermoe-granite-ep8", (KEY_SCATTERMOE_EP8,)),
-    ("moe-scattermoe-granite-ep8-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP8,)),
+    ("moe-scattermoe-granite-ep8-foak", (KEY_FAST_KERNELS, KEY_SCATTERMOE_EP8,)),
 ]
 
 
diff --git a/tox.ini b/tox.ini
index 4f35ab50..a62ae961 100644
--- a/tox.ini
+++ b/tox.ini
@@ -29,7 +29,7 @@ commands =
     # need a version of fms-hf-tuning that has integrated the framework
     # NOTE: have to install this first coz havnt merged
     # - this repo has a lot of pins, so we just install it first
-    pip install "fms-hf-tuning[flash-attn] @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@"{env:FHT_BRANCH:main}
+    pip install "fms-hf-tuning @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@"{env:FHT_BRANCH:main}
 
     # some models need this for tokenizers
     pip install protobuf
@@ -41,6 +41,9 @@ commands =
     python -m fms_acceleration.cli install -e {toxinidir}/plugins/attention-and-distributed-packing
     python -m fms_acceleration.cli install -e {toxinidir}/plugins/accelerated-moe
 
+    # install flash-attn last, since the [flash-attn] extra is no longer pulled in above
+    pip install flash-attn
+
     # run the benchmark script
     bash scripts/run_benchmarks.sh {posargs:"1 2" "4 8" benchmark_outputs}
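Note (not part of the patch): the `scattermoe` stanza above documents two divisibility constraints on `ep_degree`. The sketch below illustrates that check in plain Python; the helper name and the example values are hypothetical and are not an API of fms-acceleration.

```python
# Minimal sketch of the ep_degree constraints documented in the scattermoe
# stanza. The function and example values are hypothetical, for illustration only.

def check_ep_degree(ep_degree: int, world_size: int, num_experts: int) -> int:
    """Return experts-per-device if ep_degree is a valid sharding level."""
    if ep_degree < 1:
        raise ValueError("ep_degree must be >= 1 (1 means no sharding)")
    if ep_degree > 1 and world_size % ep_degree != 0:
        # devices are grouped in sets of ep_degree, so the group size
        # must divide the total number of devices
        raise ValueError(f"ep_degree={ep_degree} must divide world_size={world_size}")
    if ep_degree > 1 and num_experts % ep_degree != 0:
        # each device in a group holds num_experts / ep_degree experts
        raise ValueError(f"ep_degree={ep_degree} must divide num_experts={num_experts}")
    return num_experts // ep_degree

# e.g. 8 GPUs and a hypothetical 64-expert MoE with ep_degree=8 -> 8 experts per device
print(check_ep_degree(ep_degree=8, world_size=8, num_experts=64))
```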