diff --git a/plugins/accelerated-moe/README.md b/plugins/accelerated-moe/README.md
index 1d6af169..cb8e8d8e 100644
--- a/plugins/accelerated-moe/README.md
+++ b/plugins/accelerated-moe/README.md
@@ -44,10 +44,12 @@ Notes on code extraction:
 Run the below in the top-level directory of this repo:
 - the `scattermoe` dep is not included by default, so the `-x` switch installs it.
+- consider disabling `torch` memory logging to see improved speeds (the `MEMORY_LOGGING=nvidia` setting below does this).
 
 ```
 tox -e run-benches \
     -x testenv:run-benches.deps+="-r plugins/accelerated-moe/requirements-khd.txt" \
+    -x testenv:run-benches.setenv+="MEMORY_LOGGING=nvidia" \
     -- \
     "1 2 4" 128 benchmark_outputs scenarios-moe.yaml accelerated-moe-scatter
 ```
 
diff --git a/sample-configurations/CONTENTS.yaml b/sample-configurations/CONTENTS.yaml
index 24bd6fcd..41bfb01e 100644
--- a/sample-configurations/CONTENTS.yaml
+++ b/sample-configurations/CONTENTS.yaml
@@ -113,8 +113,8 @@ framework_configs:
       - accelerated-moe
     filename: moe-scattermoe-granite-ep8-sample-configuration.yaml
 
-  - shortname: moe-scattermoe-granite-ep8-padding-free
+  - shortname: moe-scattermoe-granite-ep8-foak
     plugins:
       - accelerated-moe
-      - attention-and-distributed-packing
-    filename: moe-scattermoe-granite-ep8-padding-free-sample-configuration.yaml
\ No newline at end of file
+      - fused-ops-and-kernels
+    filename: moe-scattermoe-granite-ep8-foak-sample-configuration.yaml
\ No newline at end of file
diff --git a/sample-configurations/moe-scattermoe-granite-ep8-foak-sample-configuration.yaml b/sample-configurations/moe-scattermoe-granite-ep8-foak-sample-configuration.yaml
new file mode 100644
index 00000000..938c9024
--- /dev/null
+++ b/sample-configurations/moe-scattermoe-granite-ep8-foak-sample-configuration.yaml
@@ -0,0 +1,43 @@
+# FMS Acceleration Plugin Configuration.
+#
+# Each stanza incorporates various configurations for
+# different fine-tuning / training tasks.
+plugins:
+  training:
+
+    fused_ops_and_kernels:
+
+      # if under training stanza, then putting
+      # base_layer and fused_lora will be a misnomer
+      # - this should be in peft.quantized
+      # However, if it is specified, it will still
+      # be read. This is useful in use cases where
+      # the yaml is system generated and not shown
+      # to a user.
+
+      # activate various unsloth optimizations
+      # there are two versions of the plugin
+      # - the FastKernel version supports individual kernels
+      # - the FastQuantized version is all-or-nothing
+
+      # fast loss triton kernels
+      fast_loss: true
+
+      # fast rms norm triton kernels
+      fast_rms_layernorm: true
+
+      # fast RoPE embedding triton kernels
+      fast_rope_embeddings: true
+    moe:
+
+      # expert-parallel for MoE
+      scattermoe:
+
+        # The level of expert parallel sharding.
+        # - 1 means no sharding
+        # - if > 1, please ensure that this divides the world_size. This is because
+        #   the devices will be replicated for every ep_degree devices, and
+        #   the experts will be sharded within each group.
+        # - if > 1, also ensure that it divides the number of experts, as each device
+        #   will then have num_of_experts / ep_degree experts.
+        ep_degree: 8
diff --git a/sample-configurations/moe-scattermoe-granite-ep8-padding-free-sample-configuration.yaml b/sample-configurations/moe-scattermoe-granite-ep8-padding-free-sample-configuration.yaml
deleted file mode 100644
index a29269fd..00000000
--- a/sample-configurations/moe-scattermoe-granite-ep8-padding-free-sample-configuration.yaml
+++ /dev/null
@@ -1,28 +0,0 @@
-# FMS Acceleration Plugin Configuration.
-#
-# Each stanza incorporates various configurations for
-# different fine-tuning / training tasks.
-plugins:
-  # Configurations to accelerate data packing/padding in training
-  training:
-
-    # attention module configurations
-    # e.g. padding-free modifications to attention layer
-    attention:
-
-      # this controls the confgurations for padding free computation of flash attention
-      padding_free:
-        method: huggingface
-    moe:
-
-      # expert-parallel for MoE
-      scattermoe:
-
-        # The level of expert parallel sharding.
-        # - 1 means no sharding
-        # - if > 1, please ensure that this divides the world_size. This is because
-        #   the devices will be replicated for every ep_degree devices, and
-        #   the experts will be sharded within each group.
-        # - if > 1, also ensure that it divides the number of experts, as each device
-        #   will then have num_of_experts / ep_degree experts.
-        ep_degree: 8
diff --git a/scripts/benchmarks/scenarios-moe.yaml b/scripts/benchmarks/scenarios-moe.yaml
index 3dbb996a..a1fd4c48 100644
--- a/scripts/benchmarks/scenarios-moe.yaml
+++ b/scripts/benchmarks/scenarios-moe.yaml
@@ -60,6 +60,7 @@ scenarios:
     framework_config:
       - # without acceleration
      - moe-scattermoe-granite-ep8
+      - moe-scattermoe-granite-ep8-foak
     slow: True
     arguments:
       learning_rate: 5e-5
diff --git a/scripts/generate_sample_configurations.py b/scripts/generate_sample_configurations.py
index a30351a3..e4818450 100644
--- a/scripts/generate_sample_configurations.py
+++ b/scripts/generate_sample_configurations.py
@@ -217,7 +217,7 @@ def read_configuration(path: str) -> Dict:
     ("moe-scattermoe-granite-ep4", (KEY_SCATTERMOE_EP4,)),
     ("moe-scattermoe-granite-ep4-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP4,)),
     ("moe-scattermoe-granite-ep8", (KEY_SCATTERMOE_EP8,)),
-    ("moe-scattermoe-granite-ep8-padding-free", (KEY_AADP_PADDING_FREE, KEY_SCATTERMOE_EP8,)),
+    ("moe-scattermoe-granite-ep8-foak", (KEY_FAST_KERNELS, KEY_SCATTERMOE_EP8,)),
 ]
 
 
diff --git a/tox.ini b/tox.ini
index 4f35ab50..a62ae961 100644
--- a/tox.ini
+++ b/tox.ini
@@ -29,7 +29,7 @@ commands =
     # need a version of fms-hf-tuning that has integrated the framework
     # NOTE: have to install this first coz havnt merged
     # - this repo has a lot of pins, so we just install it first
-    pip install "fms-hf-tuning[flash-attn] @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@"{env:FHT_BRANCH:main}
+    pip install "fms-hf-tuning @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@"{env:FHT_BRANCH:main}
 
     # some models need this for tokenizers
     pip install protobuf
@@ -41,6 +41,9 @@ commands =
     python -m fms_acceleration.cli install -e {toxinidir}/plugins/attention-and-distributed-packing
     python -m fms_acceleration.cli install -e {toxinidir}/plugins/accelerated-moe
 
+    # install flash-attn last, since the [flash-attn] extra is no longer pulled in above
+    pip install flash-attn
+
     # run the benchmark script
     bash scripts/run_benchmarks.sh {posargs:"1 2" "4 8" benchmark_outputs}
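Note (not part of the patch): the `scattermoe` stanza above documents two divisibility constraints on `ep_degree`. The sketch below illustrates that check in plain Python; the helper name and the example values are hypothetical and are not an API of fms-acceleration.

```python
# Minimal sketch of the ep_degree constraints documented in the scattermoe
# stanza. The function and example values are hypothetical, for illustration only.

def check_ep_degree(ep_degree: int, world_size: int, num_experts: int) -> int:
    """Return experts-per-device if ep_degree is a valid sharding level."""
    if ep_degree < 1:
        raise ValueError("ep_degree must be >= 1 (1 means no sharding)")
    if ep_degree > 1 and world_size % ep_degree != 0:
        # devices are grouped in sets of ep_degree, so the group size
        # must divide the total number of devices
        raise ValueError(f"ep_degree={ep_degree} must divide world_size={world_size}")
    if ep_degree > 1 and num_experts % ep_degree != 0:
        # each device in a group holds num_experts / ep_degree experts
        raise ValueError(f"ep_degree={ep_degree} must divide num_experts={num_experts}")
    return num_experts // ep_degree

# e.g. 8 GPUs and a hypothetical 64-expert MoE with ep_degree=8 -> 8 experts per device
print(check_ep_degree(ep_degree=8, world_size=8, num_experts=64))
```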