diff --git a/.ci/scripts/gather_test_models.py b/.ci/scripts/gather_test_models.py index 87ed31af3d..d02213b9fa 100755 --- a/.ci/scripts/gather_test_models.py +++ b/.ci/scripts/gather_test_models.py @@ -20,16 +20,16 @@ CUSTOM_RUNNERS = { "linux": { # This one runs OOM on smaller runner, the root cause is unclear (T163016365) - "w2l": "linux.12xlarge", - "ic4": "linux.12xlarge", - "resnet50": "linux.12xlarge", - "llava": "linux.12xlarge", - "llama3_2_vision_encoder": "linux.12xlarge", - # "llama3_2_text_decoder": "linux.12xlarge", # TODO: re-enable test when Huy's change is in / model gets smaller. + "w2l": "linux.4xlarge.memory", + "ic4": "linux.4xlarge.memory", + "resnet50": "linux.4xlarge.memory", + "llava": "linux.4xlarge.memory", + "llama3_2_vision_encoder": "linux.4xlarge.memory", + "llama3_2_text_decoder": "linux.4xlarge.memory", # This one causes timeout on smaller runner, the root cause is unclear (T161064121) - "dl3": "linux.12xlarge", - "emformer_join": "linux.12xlarge", - "emformer_predict": "linux.12xlarge", + "dl3": "linux.4xlarge.memory", + "emformer_join": "linux.4xlarge.memory", + "emformer_predict": "linux.4xlarge.memory", } } @@ -39,10 +39,12 @@ "linux": { "mobilebert": 90, "emformer_predict": 360, + "llama3_2_text_decoder": 360, }, "macos": { "mobilebert": 90, "emformer_predict": 360, + "llama3_2_text_decoder": 360, }, } diff --git a/.ci/scripts/setup-macos.sh b/.ci/scripts/setup-macos.sh index 833ba0aafe..b1a8ff14b5 100755 --- a/.ci/scripts/setup-macos.sh +++ b/.ci/scripts/setup-macos.sh @@ -49,6 +49,9 @@ install_buck() { rm "${BUCK2}" popd + + # Kill all running buck2 daemon for a fresh start + buck2 killall || true } function write_sccache_stub() { diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh index e109845547..5e5ed588a2 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -51,6 +51,9 @@ UPLOAD_DIR="${UPLOAD_DIR:-}" # Default PT2E_QUANTIZE to empty string if not set PT2E_QUANTIZE="${PT2E_QUANTIZE:-}" +# Default CMake Build Type to release mode +CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} + if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args echo "Expecting atleast 4 positional arguments" echo "Usage: [...]" @@ -143,7 +146,7 @@ cmake_install_executorch_libraries() { rm -rf cmake-out retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ @@ -157,7 +160,7 @@ cmake_install_executorch_libraries() { -DQNN_SDK_ROOT="$QNN_SDK_ROOT" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out . 
- cmake --build cmake-out -j9 --target install --config Debug + cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE" } cmake_build_llama_runner() { @@ -165,14 +168,14 @@ cmake_build_llama_runner() { dir="examples/models/llama" retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out/${dir} \ ${dir} - cmake --build cmake-out/${dir} -j9 --config Debug + cmake --build cmake-out/${dir} -j9 --config "$CMAKE_BUILD_TYPE" } diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 1057fa8f4a..a30143d895 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -8,11 +8,11 @@ set -exu # shellcheck source=/dev/null -BUILD_TYPE=${1:-Debug} TARGET_OS=${2:-Native} BUILD_DIR=${3:-cmake-out} +CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} -echo "Building with BUILD_TYPE: $BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR" +echo "Building with CMAKE_BUILD_TYPE: $CMAKE_BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR" if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then PYTHON_EXECUTABLE=python3 @@ -32,7 +32,7 @@ if hash nproc &> /dev/null; then NPROC=$(nproc); fi EXECUTORCH_COMMON_CMAKE_ARGS=" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ @@ -49,7 +49,7 @@ cmake_install_executorch_libraries() { ${EXECUTORCH_COMMON_CMAKE_ARGS} \ -B${BUILD_DIR} . - cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE} } cmake_install_executorch_libraries_for_android() { @@ -59,14 +59,14 @@ cmake_install_executorch_libraries_for_android() { ${EXECUTORCH_COMMON_CMAKE_ARGS} \ -B${BUILD_DIR} . 
- cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE} } LLAVA_COMMON_CMAKE_ARGS=" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON" @@ -81,7 +81,7 @@ cmake_build_llava_runner() { -B${BUILD_DIR}/${dir} \ ${dir} - cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${CMAKE_BUILD_TYPE} } @@ -98,7 +98,7 @@ cmake_build_llava_runner_for_android() { -B${BUILD_DIR}/${dir} \ ${dir} - cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${CMAKE_BUILD_TYPE} } # only export the one without custom op for now since it's diff --git a/.github/workflows/apple.yml b/.github/workflows/apple.yml index 3a07c6d394..f284d466bf 100644 --- a/.github/workflows/apple.yml +++ b/.github/workflows/apple.yml @@ -42,6 +42,8 @@ jobs: build-demo-ios: name: build-demo-ios + # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway + if: ${{ !github.event.pull_request.head.repo.fork }} uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: @@ -190,6 +192,8 @@ jobs: ) done upload-frameworks-ios: + # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway + if: ${{ !github.event.pull_request.head.repo.fork }} runs-on: ubuntu-22.04 needs: [build-frameworks-ios, set-version] timeout-minutes: 30 @@ -278,6 +282,8 @@ jobs: build-benchmark-app: name: build-benchmark-app + # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway + if: ${{ !github.event.pull_request.head.repo.fork }} uses: pytorch/test-infra/.github/workflows/macos_job.yml@main secrets: inherit with: diff --git a/.github/workflows/ghstack_land.yml b/.github/workflows/ghstack_land.yml index e3b02d2a94..09bd2a7ced 100644 --- a/.github/workflows/ghstack_land.yml +++ b/.github/workflows/ghstack_land.yml @@ -3,21 +3,7 @@ on: pull_request: types: [closed] branches: - - 'gh/cccclai/[0-9]+/base' - - 'gh/dbort/[0-9]+/base' - - 'gh/dvorjackz/[0-9]+/base' - - 'gh/guangy10/[0-9]+/base' - - 'gh/helunwencser/[0-9]+/base' - - 'gh/jorgep31415/[0-9]+/base' - - 'gh/kimishpatel/[0-9]+/base' - - 'gh/kirklandsign/[0-9]+/base' - - 'gh/larryliu0820/[0-9]+/base' - - 'gh/lucylq/[0-9]+/base' - - 'gh/manuelcandales/[0-9]+/base' - - 'gh/mcr229/[0-9]+/base' - - 'gh/swolchok/[0-9]+/base' - - 'gh/SS-JIA/[0-9]+/base' - - 'gh/trivedivivek/[0-9]+/base' + - 'gh/*/[0-9]+/base' jobs: ghstack_merge_to_main: diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 88cd8ff15a..6d7205611e 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -332,7 +332,7 @@ jobs: docker-image: executorch-ubuntu-22.04-clang12 unittest-arm: - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index ae1b88fb18..18c91691e9 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -131,7 +131,7 @@ jobs: 
test-arm-backend-delegation: name: test-arm-backend-delegation - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk @@ -157,7 +157,7 @@ jobs: test-arm-reference-delegation: name: test-arm-reference-delegation - uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: linux.2xlarge docker-image: executorch-ubuntu-22.04-arm-sdk @@ -290,7 +290,7 @@ jobs: # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava # # run e2e (export, tokenizer and runner) - # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh Release + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh test-qnn-model: name: test-qnn-model @@ -351,6 +351,8 @@ jobs: done test-huggingface-transformers: + # NB: Don't run this on fork PRs because they won't have access to the secret and would fail anyway + if: ${{ !github.event.pull_request.head.repo.fork }} name: test-huggingface-transformers uses: pytorch/test-infra/.github/workflows/linux_job.yml@main secrets: inherit diff --git a/CMakeLists.txt b/CMakeLists.txt index 1649a79aa2..487b2f60bf 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -685,6 +685,22 @@ if(EXECUTORCH_BUILD_PTHREADPOOL endif() if(EXECUTORCH_BUILD_PYBIND) + # Setup RPATH. + # See https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling + if(APPLE) + set(CMAKE_MACOSX_RPATH ON) + set(_rpath_portable_origin "@loader_path") + else() + set(_rpath_portable_origin $ORIGIN) + endif(APPLE) + # Use separate rpaths during build and install phases + set(CMAKE_SKIP_BUILD_RPATH FALSE) + # Don't use the install-rpath during the build phase + set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) + set(CMAKE_INSTALL_RPATH "${_rpath_portable_origin}") + # Automatically add all linked folders that are NOT in the build directory to + # the rpath (per library?) + set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pybind11) if(NOT EXECUTORCH_BUILD_EXTENSION_DATA_LOADER) @@ -770,46 +786,6 @@ if(EXECUTORCH_BUILD_PYBIND) target_include_directories(portable_lib PRIVATE ${TORCH_INCLUDE_DIRS}) target_compile_options(portable_lib PUBLIC ${_pybind_compile_options}) target_link_libraries(portable_lib PRIVATE ${_dep_libs}) - if(APPLE) - # pip wheels will need to be able to find the torch libraries. On Linux, the - # .so has non-absolute dependencies on libs like "libtorch.so" without - # paths; as long as we `import torch` first, those dependencies will work. - # But Apple dylibs do not support non-absolute dependencies, so we need to - # tell the loader where to look for its libraries. The LC_LOAD_DYLIB entries - # for the torch libraries will look like "@rpath/libtorch.dylib", so we can - # add an LC_RPATH entry to look in a directory relative to the installed - # location of our _portable_lib.so file. To see these LC_* values, run - # `otool -l _portable_lib*.so`. - set_target_properties( - portable_lib - PROPERTIES # Assume that this library will be installed in - # `site-packages/executorch/extension/pybindings`, and that - # the torch libs are in `site-packages/torch/lib`. 
- BUILD_RPATH "@loader_path/../../../torch/lib" - INSTALL_RPATH "@loader_path/../../../torch/lib" - # Assume is the root `site-packages/executorch` - # Need to add /extension/llm/custom_ops for - # libcustom_ops_aot_lib.dylib - BUILD_RPATH "@loader_path/../../extension/llm/custom_ops" - INSTALL_RPATH "@loader_path/../../extension/llm/custom_ops" - # Need to add /kernels/quantized for - # libquantized_ops_aot_lib.dylib - BUILD_RPATH "@loader_path/../../kernels/quantized" - INSTALL_RPATH "@loader_path/../../kernels/quantized" - ) - else() - set_target_properties( - portable_lib - PROPERTIES - # Assume is the root `site-packages/executorch` - # Need to add /extension/llm/custom_ops for - # libcustom_ops_aot_lib - # Need to add /kernels/quantized for - # libquantized_ops_aot_lib - BUILD_RPATH - "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized" - ) - endif() install(TARGETS portable_lib LIBRARY DESTINATION executorch/extension/pybindings diff --git a/backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm b/backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm index 495821544a..014540ad74 100644 --- a/backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm +++ b/backends/apple/coreml/runtime/test/ETCoreMLModelDebuggerTests.mm @@ -151,7 +151,6 @@ - (void)testMV3ProgramDebugging { XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]); XCTAssertNotNil(debuggingResults[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]); XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]); - XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]); XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]); XCTAssertNotNil(debuggingResults[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]); } diff --git a/backends/apple/coreml/runtime/test/ETCoreMLModelProfilerTests.mm b/backends/apple/coreml/runtime/test/ETCoreMLModelProfilerTests.mm index 3cc6308579..08fd87b41e 100644 --- a/backends/apple/coreml/runtime/test/ETCoreMLModelProfilerTests.mm +++ b/backends/apple/coreml/runtime/test/ETCoreMLModelProfilerTests.mm @@ -146,7 +146,6 @@ - (void)testMV3ProgramProfiling { XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_13_cast_fp16")]); XCTAssertNotNil(profilingResult[make_path_with_output_name("_inversed_aten_div_tensor_24_cast_fp16")]); XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mean_dim_7_cast_fp16")]); - XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_clamp_default_54_cast_fp16")]); XCTAssertNotNil(profilingResult[make_path_with_output_name("aten__native_batch_norm_legit_no_training_default_22_cast_fp16")]); XCTAssertNotNil(profilingResult[make_path_with_output_name("aten_mul_tensor_27_cast_fp16")]); }; diff --git a/backends/arm/_passes/cast_int64_pass.py b/backends/arm/_passes/cast_int64_pass.py index a9952edec3..aab6ed8eb4 100644 --- a/backends/arm/_passes/cast_int64_pass.py +++ b/backends/arm/_passes/cast_int64_pass.py @@ -5,8 +5,15 @@ # pyre-unsafe +import logging + import torch +from executorch.backends.arm._passes.arm_pass_utils import is_param_node from executorch.exir.pass_base import ExportPass, PassResult +from torch._export.utils import is_buffer + +logger = logging.getLogger(__name__) 
+logger.setLevel(logging.WARNING) class CastInt64ToInt32Pass(ExportPass): @@ -18,17 +25,31 @@ def _to_int32(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: fake_tensor = node.meta["val"] if isinstance(fake_tensor, torch._subclasses.fake_tensor.FakeTensor): - if node.meta["val"].dtype == torch.int64: - node.meta["val"] = node.meta["val"].to(torch.int32) - buffer_name = ( - self.exported_program.graph_signature.inputs_to_buffers[ - node.name - ] - ) - new_tensor = self.exported_program.state_dict[buffer_name].to( - torch.int32 - ) - self.exported_program.state_dict[buffer_name] = new_tensor + if node.meta["val"].dtype == torch.int64 and is_param_node( + self.exported_program, node + ): + if is_buffer(self.exported_program, node): + node.meta["val"] = node.meta["val"].to(torch.int32) + buffer_name = ( + self.exported_program.graph_signature.inputs_to_buffers[ + node.name + ] + ) + buffer = self.exported_program.state_dict[node.name] + logger.warning( + f"Casting buffer {node.name} from torch.int64 to torch.int32" + f" defined in {node.meta['stack_trace']}" + ) + if torch.min(buffer) < torch.iinfo(torch.int32).min: + raise RuntimeError( + f"Buffer {node.name} has value < {torch.iinfo(torch.int32).min}" + ) + if torch.max(buffer) > torch.iinfo(torch.int32).max: + raise RuntimeError( + f"Buffer {node.name} has value > {torch.iinfo(torch.int32).max}" + ) + buffer_int32 = buffer.to(torch.int32) + self.exported_program.state_dict[buffer_name] = buffer_int32 def call(self, graph_module: torch.fx.GraphModule): self._to_int32(graph_module) diff --git a/backends/arm/_passes/scalars_to_attribute_pass.py b/backends/arm/_passes/scalars_to_attribute_pass.py index a689799ed6..f6fe02b6eb 100644 --- a/backends/arm/_passes/scalars_to_attribute_pass.py +++ b/backends/arm/_passes/scalars_to_attribute_pass.py @@ -51,6 +51,11 @@ def call(self, graph_module: GraphModule) -> PassResult: if isinstance(arg, Node): new_args.append(arg) continue + if isinstance(arg, int) and not torch.is_floating_point( + get_first_fake_tensor(n) + ): + new_args.append(arg) + continue prefix = "_tensor_constant_" get_new_attr_name = get_new_attr_name_with_prefix(prefix) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 59473a9e6d..c59eedc304 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -135,7 +135,9 @@ def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder": self.quantize_io = quantize_io return self - def set_input_order(self, input_order: str = None) -> "ArmCompileSpecBuilder": + def set_input_order( + self, input_order: Optional[str] = None + ) -> "ArmCompileSpecBuilder": """ Reorder the inputs coming in. This may be required when inputs > 1. And while using the U55/U85 CompileSpec. 
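The revised CastInt64ToInt32Pass above only downcasts int64 buffers after verifying that every value fits in int32, and raises a RuntimeError otherwise. As a minimal standalone sketch of that guard (illustrative only, not the pass itself; the helper name downcast_int64_buffer is invented here, torch is the only dependency):

import torch

def downcast_int64_buffer(buffer: torch.Tensor) -> torch.Tensor:
    # Mirror the range check the pass performs before casting a buffer.
    info = torch.iinfo(torch.int32)
    if torch.min(buffer) < info.min or torch.max(buffer) > info.max:
        raise RuntimeError("Buffer has values outside the int32 range")
    return buffer.to(torch.int32)

# Values within range are cast; out-of-range values raise instead of overflowing.
print(downcast_int64_buffer(torch.tensor([1, 2, 3], dtype=torch.int64)).dtype)  # torch.int32
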
diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py index afd079fb95..ad3ddf8c0a 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -23,10 +23,10 @@ test_data_suite = [ # (test_name, test_data, [kernel_size, stride, padding]) - ("zeros", torch.zeros(20, 16, 50, 32), [4, 2, 0]), - ("ones", torch.zeros(20, 16, 50, 32), [4, 2, 0]), - ("rand", torch.rand(20, 16, 50, 32), [4, 2, 0]), - ("randn", torch.randn(20, 16, 50, 32), [4, 2, 0]), + ("zeros", torch.zeros(1, 16, 50, 32), [4, 2, 0]), + ("ones", torch.zeros(1, 16, 50, 32), [4, 2, 0]), + ("rand", torch.rand(1, 16, 50, 32), [4, 2, 0]), + ("randn", torch.randn(1, 16, 50, 32), [4, 2, 0]), ] @@ -101,7 +101,7 @@ def _test_avgpool2d_tosa_ethos_BI_pipeline( test_data: Tuple[torch.tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -116,7 +116,10 @@ def _test_avgpool2d_tosa_ethos_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_avgpool2d_tosa_MI( diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 6246657120..824ec46372 100644 --- a/backends/arm/test/ops/test_bmm.py +++ b/backends/arm/test/ops/test_bmm.py @@ -41,7 +41,7 @@ def forward(self, x, y): class BMMSingleInput(torch.nn.Module): test_parameters = [ (torch.rand(20, 3, 3),), - (torch.ones(2, 128, 128),), + (torch.rand(2, 128, 128),), (10000 * torch.randn(4, 25, 25),), (5 + 5 * torch.randn(3, 64, 64),), ] @@ -96,7 +96,7 @@ def _test_bmm_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor, ...], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -110,7 +110,10 @@ def _test_bmm_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) @parameterized.expand(BMM.test_parameters) def test_bmm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): @@ -143,9 +146,20 @@ def test_bmm_single_input_tosa_BI(self, operand1: torch.Tensor): self._test_bmm_tosa_BI_pipeline(self.BMMSingleInput(), test_data) @parameterized.expand(BMM.test_parameters) + @unittest.expectedFailure def test_bmm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) - self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data) + self._test_bmm_ethosu_BI_pipeline( + self.BMM(), common.get_u55_compile_spec(), test_data + ) + + @parameterized.expand(BMM.test_parameters) + @common.expectedFailureOnFVP + def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_ethosu_BI_pipeline( + self.BMM(), common.get_u85_compile_spec(), test_data + ) # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy @parameterized.expand(BMMSingleInput.test_parameters) @@ -156,7 +170,9 @@ def test_bmm_single_input_u55_BI(self, operand1: torch.Tensor): self.BMMSingleInput(), common.get_u55_compile_spec(), test_data ) + # Numerical issues on FVP, MLETORCH 534 
@parameterized.expand(BMMSingleInput.test_parameters) + @common.expectedFailureOnFVP def test_bmm_single_input_u85_BI(self, operand1: torch.Tensor): test_data = (operand1,) self._test_bmm_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py index b380c44d52..88846369d0 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -96,7 +96,7 @@ def _test_cat_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[tuple[torch.Tensor, ...], int], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -108,10 +108,14 @@ def _test_cat_ethosu_BI_pipeline( .check(["torch.ops.quantized_decomposed"]) .to_edge() .partition() + .dump_artifact() .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(inputs=test_data) @parameterized.expand(Cat.test_parameters) def test_cat_tosa_MI(self, operands: tuple[torch.Tensor, ...], dim: int): @@ -129,14 +133,18 @@ def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int): test_data = (operands, dim) self._test_cat_tosa_BI_pipeline(self.Cat(), test_data) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Cat.test_parameters) + @common.expectedFailureOnFVP def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): test_data = (operands, dim) self._test_cat_ethosu_BI_pipeline( self.Cat(), common.get_u55_compile_spec(), test_data ) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Cat.test_parameters) + @common.expectedFailureOnFVP def test_cat_u85_BI(self, operands: tuple[torch.Tensor, ...], dim: int): test_data = (operands, dim) self._test_cat_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 4721f257b0..6b5216a8e1 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -85,7 +85,7 @@ def _test_clone_tosa_ethos_pipeline( test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -94,7 +94,10 @@ def _test_clone_tosa_ethos_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) def _test_clone_tosa_u55_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py index 133148faef..f00c7984a1 100644 --- a/backends/arm/test/ops/test_conv1d.py +++ b/backends/arm/test/ops/test_conv1d.py @@ -268,7 +268,7 @@ def _test_conv1d_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize() .export() @@ -277,7 +277,10 @@ def _test_conv1d_ethosu_BI_pipeline( .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) .to_executorch() + .serialize() ) + if 
common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite) def test_conv1d_tosa_MI(self, test_name, model): @@ -295,6 +298,9 @@ def test_conv1d_u55_BI(self, test_name, model): model, common.get_u55_compile_spec(), model.get_inputs() ) + # This specific test case has numerical errors on FVP, MLETORCH-520. + testsuite.remove(("5_3x2x128_st1", conv1d_5_3x2x128_st1)) + @parameterized.expand(testsuite) def test_conv1d_u85_BI(self, test_name, model): self._test_conv1d_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 43c3e85139..21df4bf0d5 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -295,7 +295,7 @@ def _test_conv2d_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -308,7 +308,10 @@ def _test_conv2d_ethosu_BI_pipeline( .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite) def test_conv2d_tosa_MI(self, test_name, model): @@ -318,6 +321,10 @@ def test_conv2d_tosa_MI(self, test_name, model): def test_conv2d_tosa_BI(self, test_name, model): self._test_conv2d_tosa_BI_pipeline(model, model.get_inputs()) + # These cases have numerical issues on FVP, MLETORCH-520 + testsuite.remove(("2x2_3x2x40x40_nobias", conv2d_2x2_3x2x40x40_nobias)) + testsuite.remove(("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1)) + @parameterized.expand(testsuite) def test_conv2d_u55_BI(self, test_name, model): self._test_conv2d_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 3e9bdef958..7555fff720 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -238,7 +238,7 @@ def _test_conv_combo_ethos_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -251,7 +251,10 @@ def _test_conv_combo_ethos_BI_pipeline( .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(list(module.edge_op_list)) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) #################### ## Conv + meandim ## @@ -272,6 +275,8 @@ def test_conv_meandim_u55_BI(self): model.get_inputs(), ) + # Numerical Issues on FVP, MLETORCH-520 + @common.expectedFailureOnFVP def test_conv_meandim_u85_BI(self): model = ComboConv2dMeandim() self._test_conv_combo_ethos_BI_pipeline( diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 4bfa863c49..28cb9ac844 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -8,8 +8,6 @@ from typing import Tuple -import pytest - import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.ops.test_conv1d import Conv1d @@ -160,8 +158,8 @@ testsuite_conv1d = [ ("2_1x6x4_gp6_st1", dw_conv1d_2_1x6x4_gp6_st1), - ("3_1x3x256_gp3_st1", dw_conv1d_3_1x3x256_gp3_st1), ("two_dw_conv1d", two_dw_conv1d), + ("3_1x3x256_gp3_st1", 
dw_conv1d_3_1x3x256_gp3_st1), ("3_1x3x14_gp3_st1", dw_conv1d_3_1x3x14_gp3_st1), ] @@ -217,7 +215,7 @@ def _test_dw_conv_ethos_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -230,7 +228,10 @@ def _test_dw_conv_ethos_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite_conv1d + testsuite_conv2d) def test_dw_conv_tosa_MI(self, test_name: str, model: torch.nn.Module): @@ -238,11 +239,15 @@ def test_dw_conv_tosa_MI(self, test_name: str, model: torch.nn.Module): # TODO: Investigate flakyness (MLTORCH-307) @parameterized.expand(testsuite_conv1d + testsuite_conv2d) - @pytest.mark.flaky(reruns=3) def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv_tosa_BI_pipeline(model, model.get_inputs()) + testsuite_conv2d.remove( + ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1) + ) # Works + @parameterized.expand(testsuite_conv2d, skip_on_empty=True) + @common.expectedFailureOnFVP def test_dw_conv2d_u55_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): @@ -269,7 +274,21 @@ def test_dw_conv1d_u55_BI( model.get_inputs(), ) - @parameterized.expand(testsuite_conv1d + testsuite_conv2d) + # All test cases except 3x3_1x3x256x256_gp3_st1 have numerical issues on FVP. MLETORCH-520 + @parameterized.expand(testsuite_conv1d[:-2] + testsuite_conv2d) + @common.expectedFailureOnFVP + def test_dw_conv_u85_BI_xfails( + self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False + ): + self._test_dw_conv_ethos_BI_pipeline( + model, + common.get_u85_compile_spec( + permute_memory_to_nhwc=True, quantize_io=set_quantize_io + ), + model.get_inputs(), + ) + + @parameterized.expand(testsuite_conv1d[-2:]) def test_dw_conv_u85_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index 28cc686690..b3815f3e7c 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -136,10 +136,10 @@ def _test_div_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, atol=1, rtol=0.1) ) - def _test_div_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_div_ethos_BI_pipeline( + self, module: torch.nn.Module, compile_spec, test_data: Tuple[torch.Tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -155,7 +155,10 @@ def _test_div_u55_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_div_tosa_MI( @@ -180,7 +183,9 @@ def test_div_tosa_BI( test_data = (input_, other_) self._test_div_tosa_BI_pipeline(self.Div(), test_data) + # Numerical issues on FVP likely due to mul op, MLETORCH-521 @parameterized.expand(test_data_suite) + @common.expectedFailureOnFVP def test_div_u55_BI( self, test_name: str, @@ -189,4 +194,21 @@ def test_div_u55_BI( rounding_mode: Optional[str] = None, ): test_data = (input_, other_) - 
self._test_div_u55_BI_pipeline(self.Div(), test_data) + self._test_div_ethos_BI_pipeline( + self.Div(), common.get_u55_compile_spec(), test_data + ) + + # Numerical issues on FVP likely due to mul op, MLETORCH-521 + @parameterized.expand(test_data_suite) + @common.expectedFailureOnFVP + def test_div_u85_BI( + self, + test_name: str, + input_: Union[torch.Tensor, torch.types.Number], + other_: Union[torch.Tensor, torch.types.Number], + rounding_mode: Optional[str] = None, + ): + test_data = (input_, other_) + self._test_div_ethos_BI_pipeline( + self.Div(), common.get_u85_compile_spec(), test_data + ) diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py index c706b7b206..f33e0a9058 100644 --- a/backends/arm/test/ops/test_exp.py +++ b/backends/arm/test/ops/test_exp.py @@ -20,7 +20,7 @@ ("zeros", torch.zeros(1, 10, 10, 10)), ("ones", torch.ones(10, 10, 10)), ("rand", torch.rand(10, 10) - 0.5), - ("randn_pos", torch.randn(10) + 10), + ("randn_pos", torch.randn(1, 4, 4, 4) + 10), ("randn_neg", torch.randn(10) - 10), ("ramp", torch.arange(-16, 16, 0.2)), ] @@ -78,7 +78,7 @@ def _test_exp_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -93,7 +93,10 @@ def _test_exp_ethosu_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_exp_tosa_MI( diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py index effa7ce713..27f311b546 100644 --- a/backends/arm/test/ops/test_expand.py +++ b/backends/arm/test/ops/test_expand.py @@ -81,7 +81,7 @@ def _test_expand_ethosu_BI_pipeline( self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: Tuple ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -95,7 +95,10 @@ def _test_expand_ethosu_BI_pipeline( .check_not(["torch.ops.aten.expand.default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(Expand.test_parameters) def test_expand_tosa_MI(self, test_input, multiples): @@ -105,13 +108,17 @@ def test_expand_tosa_MI(self, test_input, multiples): def test_expand_tosa_BI(self, test_input, multiples): self._test_expand_tosa_BI_pipeline(self.Expand(), (test_input, multiples)) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Expand.test_parameters) + @common.expectedFailureOnFVP def test_expand_u55_BI(self, test_input, multiples): self._test_expand_ethosu_BI_pipeline( common.get_u55_compile_spec(), self.Expand(), (test_input, multiples) ) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Expand.test_parameters) + @common.expectedFailureOnFVP def test_expand_u85_BI(self, test_input, multiples): self._test_expand_ethosu_BI_pipeline( common.get_u85_compile_spec(), self.Expand(), (test_input, multiples) diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index d4cfc5c369..9857a7b87b 100644 --- 
a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -97,7 +97,7 @@ def _test_full_tosa_BI_pipeline( def _test_full_tosa_ethos_pipeline( self, compile_spec: list[CompileSpec], module: torch.nn.Module, test_data: Tuple ): - ( + tester = ( ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize() .export() @@ -107,7 +107,10 @@ def _test_full_tosa_ethos_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_full_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): self._test_full_tosa_ethos_pipeline( @@ -140,14 +143,18 @@ def test_full_tosa_MI(self, test_tensor: Tuple): def test_full_tosa_BI(self, test_tensor: Tuple): self._test_full_tosa_BI_pipeline(self.AddVariableFull(), test_tensor, False) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(AddVariableFull.test_parameters) + @common.expectedFailureOnFVP def test_full_u55_BI(self, test_tensor: Tuple): self._test_full_tosa_u55_pipeline( self.AddVariableFull(), test_tensor, ) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(AddVariableFull.test_parameters) + @common.expectedFailureOnFVP def test_full_u85_BI(self, test_tensor: Tuple): self._test_full_tosa_u85_pipeline( self.AddVariableFull(), diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py index a9f12abdf0..10073c5095 100644 --- a/backends/arm/test/ops/test_hardtanh.py +++ b/backends/arm/test/ops/test_hardtanh.py @@ -87,15 +87,15 @@ def _test_hardtanh_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_hardtanh_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_hardtanh_tosa_ethosu_BI_pipeline( + self, compile_spec, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -106,7 +106,10 @@ def _test_hardtanh_tosa_u55_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_hardtanh_tosa_MI( @@ -122,4 +125,12 @@ def test_hardtanh_tosa_BI(self, test_name: str, test_data: torch.Tensor): @parameterized.expand(test_data_suite) def test_hardtanh_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): - self._test_hardtanh_tosa_u55_BI_pipeline(self.HardTanh(), (test_data,)) + self._test_hardtanh_tosa_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.HardTanh(), (test_data,) + ) + + @parameterized.expand(test_data_suite) + def test_hardtanh_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): + self._test_hardtanh_tosa_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.HardTanh(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_layer_norm.py 
b/backends/arm/test/ops/test_layer_norm.py index f059d71eba..0b06044a59 100644 --- a/backends/arm/test/ops/test_layer_norm.py +++ b/backends/arm/test/ops/test_layer_norm.py @@ -115,7 +115,7 @@ def _test_layernorm_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( model=module, example_inputs=test_data, @@ -128,7 +128,10 @@ def _test_layernorm_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_layer_norm_tosa_MI( @@ -152,8 +155,10 @@ def test_layer_norm_tosa_BI( self.LayerNorm(*model_params), (test_data,) ) + # Numerical issues on FVP likely due to mul op, MLETORCH-521 # Skip tests that require transposes. @parameterized.expand(test_data_suite[:-2]) + @common.expectedFailureOnFVP def test_layer_norm_u55_BI( self, test_name: str, @@ -164,7 +169,21 @@ def test_layer_norm_u55_BI( self.LayerNorm(*model_params), common.get_u55_compile_spec(), (test_data,) ) - @parameterized.expand(test_data_suite) + # Numerical issues on FVP likely due to mul op, MLETORCH-521 + @parameterized.expand(test_data_suite[:-1]) + @common.expectedFailureOnFVP + def test_layer_norm_u85_BI_fvp_xfails( + self, + test_name: str, + test_data: torch.Tensor, + model_params, + ): + self._test_layernorm_ethosu_BI_pipeline( + self.LayerNorm(*model_params), common.get_u85_compile_spec(), (test_data,) + ) + + @parameterized.expand(test_data_suite[-1:]) + @unittest.skip # Flaky def test_layer_norm_u85_BI( self, test_name: str, diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py index 847635ea36..10175d27fb 100644 --- a/backends/arm/test/ops/test_log.py +++ b/backends/arm/test/ops/test_log.py @@ -78,7 +78,7 @@ def _test_log_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -93,7 +93,10 @@ def _test_log_ethosu_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_log_tosa_MI( diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index 7fa20c2566..8f0321ea5f 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -16,9 +16,9 @@ test_data_sute = [ # (test_name, input, other,) See torch.mul() for info ( - "op_mul_rank1_ones", - torch.ones(5), - torch.ones(5), + "op_mul_rank1_rand", + torch.rand(5) * 3.7, + torch.rand(5) * 1.5, ), ( "op_mul_rank2_rand", @@ -32,23 +32,23 @@ ), ( "op_mul_rank4_randn", - torch.randn(5, 10, 25, 20), - torch.randn(5, 10, 25, 20), + torch.randn(1, 10, 25, 20), + torch.randn(1, 10, 25, 20), ), ( "op_mul_rank4_ones_mul_negative", torch.ones(1, 10, 25, 20), - (-1) * torch.ones(5, 10, 25, 20), + (-1) * torch.ones(1, 10, 25, 20), ), ( "op_mul_rank4_negative_large_rand", - (-200) * torch.rand(5, 10, 25, 20), - torch.rand(5, 1, 1, 20), + (-200) * torch.rand(1, 10, 25, 20), + torch.rand(1, 1, 1, 20), ), ( "op_mul_rank4_large_randn", - 200 * torch.randn(5, 10, 25, 20), - torch.rand(5, 10, 25, 1), + 200 * torch.randn(1, 10, 25, 20), + torch.rand(1, 10, 
25, 1), ), ] @@ -112,7 +112,7 @@ def _test_mul_ethosu_BI_pipeline( module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -126,7 +126,10 @@ def _test_mul_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_sute) def test_mul_tosa_MI( @@ -149,7 +152,9 @@ def test_mul_tosa_BI( test_data = (input_, other_) self._test_mul_tosa_BI_pipeline(self.Mul(), test_data) + # Numerical issues on FVP, MLETORCH-521 @parameterized.expand(test_data_sute) + @common.expectedFailureOnFVP def test_mul_u55_BI( self, test_name: str, @@ -161,7 +166,10 @@ def test_mul_u55_BI( common.get_u55_compile_spec(), self.Mul(), test_data ) - @parameterized.expand(test_data_sute) + # Numerical issues on FVP, MLETORCH-521 + # test_data_sute[0] works on U85 + @parameterized.expand(test_data_sute[1:]) + @common.expectedFailureOnFVP def test_mul_u85_BI( self, test_name: str, diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py index 62b6b823de..92400215b7 100644 --- a/backends/arm/test/ops/test_permute.py +++ b/backends/arm/test/ops/test_permute.py @@ -100,7 +100,7 @@ def _test_permute_ethos_BI_pipeline( test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -117,6 +117,8 @@ def _test_permute_ethos_BI_pipeline( .to_executorch() .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_permute_tosa_MI( @@ -143,10 +145,20 @@ def test_permute_u55_BI( self.Permute(dims=dims), common.get_u55_compile_spec(), (test_data,) ) - @parameterized.expand(test_data_suite) + @parameterized.expand(test_data_suite[:-2]) def test_permute_u85_BI( self, test_name: str, test_data: torch.Tensor, dims: list[int] ): self._test_permute_ethos_BI_pipeline( self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,) ) + + # Fails since on FVP since N > 1 is not supported. 
MLETORCH-517 + @parameterized.expand(test_data_suite[-2:]) + @common.expectedFailureOnFVP + def test_permute_u85_BI_xfails( + self, test_name: str, test_data: torch.Tensor, dims: list[int] + ): + self._test_permute_ethos_BI_pipeline( + self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_reciprocal.py b/backends/arm/test/ops/test_reciprocal.py index 7745a614e6..876f063c76 100644 --- a/backends/arm/test/ops/test_reciprocal.py +++ b/backends/arm/test/ops/test_reciprocal.py @@ -22,12 +22,12 @@ torch.rand(5) * 5, ), ("op_reciprocal_rank1_negative_ones", torch.ones(5) * (-1)), - ("op_reciprocal_rank4_ones", torch.ones(5, 10, 25, 20)), - ("op_reciprocal_rank4_negative_ones", (-1) * torch.ones(5, 10, 25, 20)), - ("op_reciprocal_rank4_ones_reciprocal_negative", torch.ones(5, 10, 25, 20)), - ("op_reciprocal_rank4_large_rand", 200 * torch.rand(5, 10, 25, 20)), - ("op_reciprocal_rank4_negative_large_rand", (-200) * torch.rand(5, 10, 25, 20)), - ("op_reciprocal_rank4_large_randn", 200 * torch.randn(5, 10, 25, 20) + 1), + ("op_reciprocal_rank4_ones", torch.ones(1, 10, 25, 20)), + ("op_reciprocal_rank4_negative_ones", (-1) * torch.ones(1, 10, 25, 20)), + ("op_reciprocal_rank4_ones_reciprocal_negative", torch.ones(1, 10, 25, 20)), + ("op_reciprocal_rank4_large_rand", 200 * torch.rand(1, 10, 25, 20)), + ("op_reciprocal_rank4_negative_large_rand", (-200) * torch.rand(1, 10, 25, 20)), + ("op_reciprocal_rank4_large_randn", 200 * torch.randn(1, 10, 25, 20) + 1), ] @@ -81,7 +81,7 @@ def _test_reciprocal_tosa_BI_pipeline( def _test_reciprocal_u55_BI_pipeline( self, module: torch.nn.Module, test_data: tuple[torch.Tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -95,15 +95,16 @@ def _test_reciprocal_u55_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_reciprocal_tosa_MI(self, test_name: str, input_: torch.Tensor): test_data = (input_,) self._test_reciprocal_tosa_MI_pipeline(self.Reciprocal(), test_data) - # Expected to fail since ArmQuantizer cannot quantize a Reciprocal layer - # TODO(MLETORCH-129) @parameterized.expand(test_data_suite) def test_reciprocal_tosa_BI(self, test_name: str, input_: torch.Tensor): diff --git a/backends/arm/test/ops/test_scalars.py b/backends/arm/test/ops/test_scalars.py index 86433745a6..cd3dd72f60 100644 --- a/backends/arm/test/ops/test_scalars.py +++ b/backends/arm/test/ops/test_scalars.py @@ -75,6 +75,12 @@ def forward(self, x): x = 1.0 + x return x + class ShiftInplaceSub(torch.nn.Module): + def forward(self, x): + x = x >> 4 + x -= 10 + return x + # Inplace ops end with '_' (from aten naming) ops = [ ("Add", Add()), @@ -160,3 +166,6 @@ def test_MI_const(self, test_name: str, op: torch.nn.Module, x): @parameterized.expand(tensor_scalar_tests) def test_BI(self, test_name: str, op: torch.nn.Module, x, y): self._test_add_tosa_BI_pipeline(op, (x, y)) + + def test_shift_sub_inplace_tosa_MI(self): + self._test_add_tosa_MI_pipeline(self.ShiftInplaceSub(), (torch.IntTensor(5),)) diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index 5c67240e52..327a8de994 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -17,7 +17,7 @@ from parameterized import parameterized -class 
TestSimpleSub(unittest.TestCase): +class TestSub(unittest.TestCase): class Sub(torch.nn.Module): test_parameters = [ (torch.ones(5),), @@ -82,7 +82,7 @@ def _test_sub_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -96,7 +96,10 @@ def _test_sub_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(Sub.test_parameters) def test_sub_tosa_MI(self, test_data: torch.Tensor): diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py index 5940067af6..b61c1b465f 100644 --- a/backends/arm/test/runner_utils.py +++ b/backends/arm/test/runner_utils.py @@ -266,8 +266,6 @@ def run_corstone( "-C", "mps3_board.uart0.out_file='-'", "-C", - "cpu0.CFGITCMSZ=11", - "-C", "cpu0.semihosting-enable=1", "-C", "cpu0.semihosting-stack_base=0", diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 24b0266911..661f8cf0d4 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -50,6 +50,26 @@ python_library( ], ) +python_library( + name = "export_example", + srcs = [ + "export_example.py", + ], + deps = [ + ":passes", + ":utils", + ":ops_registrations", + ":replace_ops", + "//caffe2:torch", + "//executorch/backends/cadence/aot/quantizer:fusion_pass", + "//executorch/backends/cadence/runtime:runtime", + "//executorch/backends/cadence/aot/quantizer:quantizer", + "//executorch/backends/transforms:decompose_sdpa", + "//executorch/backends/transforms:remove_clone_ops", + "//executorch/exir:lib", + "//executorch/devtools:lib", + ], +) python_library( name = "pass_utils", diff --git a/backends/cadence/aot/export_example.py b/backends/cadence/aot/export_example.py index 146d4f806c..4ba5bffc96 100644 --- a/backends/cadence/aot/export_example.py +++ b/backends/cadence/aot/export_example.py @@ -60,6 +60,7 @@ def export_model( model: nn.Module, example_inputs: Tuple[Any, ...], file_name: str = "CadenceDemoModel", + run_and_compare: bool = True, ): # create work directory for outputs and model binary working_dir = tempfile.mkdtemp(dir="/tmp") @@ -112,9 +113,10 @@ def export_model( ) # TODO: move to test infra - runtime.run_and_compare( - executorch_prog=exec_prog, - inputs=example_inputs, - ref_outputs=ref_outputs, - working_dir=working_dir, - ) + if run_and_compare: + runtime.run_and_compare( + executorch_prog=exec_prog, + inputs=example_inputs, + ref_outputs=ref_outputs, + working_dir=working_dir, + ) diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index cf234c22c0..b6a2c50001 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -77,10 +77,20 @@ - arg_meta: null kernel_name: torch::executor::max_pool2d_with_indices_out +- op: maximum.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::maximum_out + - op: mean.out kernels: - arg_meta: null - kernel_name: cadence::impl::HiFi::mean_dim_out + kernel_name: cadence::impl::HiFi::mean_dim_out + +- op: minimum.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::minimum_out - op: mul.out kernels: @@ -92,6 +102,26 @@ - arg_meta: null kernel_name: torch::executor::permute_copy_out +- op: pow.Scalar_out + kernels: + - arg_meta: null + kernel_name: 
cadence::impl::HiFi::pow_Scalar_out + +- op: pow.Tensor_Scalar_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::pow_Tensor_Scalar_out + +- op: pow.Tensor_Tensor_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::pow_Tensor_Tensor_out + +- op: rsqrt.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::rsqrt_out + - op: sigmoid.out kernels: - arg_meta: null diff --git a/backends/cadence/aot/utils.py b/backends/cadence/aot/utils.py index e8b64ef567..534b4f0d9f 100644 --- a/backends/cadence/aot/utils.py +++ b/backends/cadence/aot/utils.py @@ -162,7 +162,8 @@ def print_ops_info( # Print the final ops and their counts in a tabular format logging.info( - tabulate( + "\n" + + tabulate( sorted_ops_count, headers=[ "Final Operators ", # one character longer than the longest op name diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp index 6dc710ce6e..9537cbacb7 100644 --- a/backends/cadence/fusion_g3/operators/op_add.cpp +++ b/backends/cadence/fusion_g3/operators/op_add.cpp @@ -76,27 +76,45 @@ Tensor& add_out( int inp2_shape[kTensorDimensionLimit]; int out_shape[kTensorDimensionLimit]; - /* input shapes and output shapes */ - for (auto i = 0; i < a_size.size(); i++) { - inp1_shape[i] = a_size[i]; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? out.dim() : max_dim; + + bool optimized = 1; + + if ((a.dim() == 0) || (b.dim() == 0)) { + optimized = 0; } - for (auto i = 0; i < b_size.size(); i++) { - inp2_shape[i] = b_size[i]; + if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) { + optimized = 0; } - for (auto i = 0; i < out_size.size(); i++) { - out_shape[i] = out_size[i]; + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; } - /*find broadcast*/ - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); - int max_dim = a.dim() > b.dim() ? 
a.dim() : b.dim(); + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } - if (compute_type == ScalarType::Int) { + if ((compute_type == ScalarType::Int) && (optimized)) { const int* const inp1_data = a.const_data_ptr(); const int* const inp2_data = b.const_data_ptr(); int* const out_data = out.mutable_data_ptr(); @@ -117,7 +135,7 @@ Tensor& add_out( xa_nn_elm_add_32x32_32( out_data, inp1_data, inp2_data, alpha_val, out.numel()); } - } else if (compute_type == ScalarType::Float) { + } else if ((compute_type == ScalarType::Float) && (optimized)) { const float* const inp1_data = a.const_data_ptr(); const float* const inp2_data = b.const_data_ptr(); float* const out_data = out.mutable_data_ptr(); diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp index 366982ae3f..31cd50314e 100644 --- a/backends/cadence/fusion_g3/operators/op_mul.cpp +++ b/backends/cadence/fusion_g3/operators/op_mul.cpp @@ -68,27 +68,45 @@ Tensor& mul_out( int inp2_shape[kTensorDimensionLimit]; int out_shape[kTensorDimensionLimit]; - /* input shapes and output shapes */ - for (auto i = 0; i < a_size.size(); i++) { - inp1_shape[i] = a_size[i]; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? out.dim() : max_dim; + + bool optimized = 1; + + if ((a.dim() == 0) || (b.dim() == 0)) { + optimized = 0; } - for (auto i = 0; i < b_size.size(); i++) { - inp2_shape[i] = b_size[i]; + if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) { + optimized = 0; } - for (auto i = 0; i < out_size.size(); i++) { - out_shape[i] = out_size[i]; + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; } - /*find broadcast*/ - const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); - const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); - const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); - int max_dim = a.dim() > b.dim() ? 
a.dim() : b.dim(); + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } - if (compute_type == ScalarType::Int) { + if ((compute_type == ScalarType::Int) && (optimized)) { const int* const inp1_data = a.const_data_ptr(); const int* const inp2_data = b.const_data_ptr(); int* const out_data = out.mutable_data_ptr(); @@ -105,7 +123,7 @@ Tensor& mul_out( } else { xa_nn_elm_mul_32x32_32(out_data, inp1_data, inp2_data, out.numel()); } - } else if (compute_type == ScalarType::Float) { + } else if ((compute_type == ScalarType::Float) && (optimized)) { const float* const inp1_data = a.const_data_ptr(); const float* const inp2_data = b.const_data_ptr(); float* const out_data = out.mutable_data_ptr(); diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 9321cc544e..3d321443f8 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -9,10 +9,13 @@ add_library( cadence_kernels kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c ) diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 2c915661f8..10927adc2a 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -15,6 +15,14 @@ #include "xa_nnlib_kernels_api.h" /* Potential NNLIB function/APIs */ + +extern "C" WORD32 xa_nn_broadcast_32_32( + WORD32* __restrict__ p_out, + const int* const out_shape, + WORD32* __restrict__ p_in, + const int* const in_shape, + int num_dims); + extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -47,6 +55,34 @@ extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( const WORD32* const p_inp2_shape, WORD32 mode); +extern "C" WORD32 xa_nn_elm_maximum_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + +extern "C" WORD32 xa_nn_elm_minimum_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ 
p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -55,6 +91,12 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( const FLOAT32* __restrict__ p_inp2, const WORD32* const p_inp2_shape); +extern "C" void xa_nn_elm_pow_f32( + FLOAT32* restrict z, + const FLOAT32* restrict x, + const FLOAT32* restrict y, + WORD32 N); + extern "C" WORD32 xa_nn_elm_where_f32xf32_f32( FLOAT32* __restrict__ p_out, const FLOAT32* __restrict__ p_inp1, diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index fc00345465..5e51f7fd3b 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -22,8 +22,12 @@ endif() set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp" diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp new file mode 100644 index 0000000000..f85d3470e9 --- /dev/null +++ b/backends/cadence/hifi/operators/op_maximum.cpp @@ -0,0 +1,174 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
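The declarations added to kernels.h above expose the new NNLIB entry points in two flavors: a flat element-wise form that takes raw float pointers plus an element count, and a _broadcast_4D_ form (for maximum/minimum) that additionally takes explicit 4-element shape arrays. pow has no broadcast variant here; op_pow.cpp below instead materializes broadcast inputs with xa_nn_broadcast_32_32 before calling the flat xa_nn_elm_pow_f32. A minimal sketch of a caller for the flat maximum kernel (function name and buffers are illustrative, assuming FLOAT32/WORD32 are the usual 32-bit NNLIB typedefs and the kernels.h shown above is on the include path):

#include "backends/cadence/hifi/kernels/kernels.h"  // assumed include path for the declarations above

// Element-wise maximum of two equal-length float buffers via the flat kernel.
void maximum_f32_sketch(float* out, const float* a, const float* b, int n) {
  WORD32 status = xa_nn_elm_maximum_f32xf32_f32(out, a, b, n);
  (void)status;  // 0 on success; -1 if a pointer is null/misaligned or n <= 0
}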
+ */ + +#include +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { +namespace { + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = + torch::executor::native::utils::max_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& maximum_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + bool optimized = true; + /*find broadcast*/ + bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = false; + if ((broadcast == true) && (max_dim > kNnlibMaxDim)) + optimized = false; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast == true) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + off_o] = out.size(i); + } + + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_maximum_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + return out; + } + ET_SWITCH_REALHB_TYPES(a_type, ctx, "maximum.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "maximum.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "maximum.out", CTYPE_OUT, [&]() { + MaximumInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_minimum.cpp b/backends/cadence/hifi/operators/op_minimum.cpp new file mode 100644 index 0000000000..6f81ad5c3e --- /dev/null +++ b/backends/cadence/hifi/operators/op_minimum.cpp @@ -0,0 +1,173 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
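The optimized branch of maximum_out above, like the reworked shape handling in the Fusion G3 add/mul operators earlier in this patch, right-aligns each tensor's sizes into a fixed-length shape array padded with leading 1s, so the NNLIB 4-D broadcast kernel always sees rank-4 shapes. A standalone sketch of that padding step (helper name is hypothetical; Tensor is the exec_aten type used above):

// Right-align a tensor's sizes into `padded` (length padded_rank), filling
// leading positions with 1 -- the layout the *_broadcast_4D_* kernels expect.
void pad_shape_right_aligned(const Tensor& t, int* padded, int padded_rank) {
  for (int i = 0; i < padded_rank; ++i) {
    padded[i] = 1;
  }
  const int offset = padded_rank - t.dim();
  for (int i = 0; i < t.dim(); ++i) {
    padded[i + offset] = t.size(i);
  }
}

With kNnlibMaxDim = 4, a shape of [3, 5] becomes {1, 1, 3, 5}; the operator falls back to the portable path whenever broadcasting is needed at a rank above that limit, or when an operand is not float.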
+ */ + +#include +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { +namespace { + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = + torch::executor::native::utils::min_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& minimum_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + bool optimized = true; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = false; + if ((broadcast == true) && (max_dim > kNnlibMaxDim)) + optimized = false; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast == true) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + off_o] = out.size(i); + } + + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_minimum_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + return out; + } + ET_SWITCH_REALHB_TYPES(a_type, ctx, "minimum.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "minimum.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "minimum.out", CTYPE_OUT, [&]() { + MinimumInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp new file mode 100644 index 0000000000..1399c24a34 --- /dev/null +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -0,0 +1,353 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
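MinimumInner above (and MaximumInner before it) uses a bool template parameter derived from can_cast to pick, at compile time, between the real element-wise body and the ReportCanCastBug stub whose only job is to fire a debug check; the runtime canCast() test guarantees the stub is never reached. A stripped-down version of the pattern, with placeholder names:

#include <cassert>
#include <type_traits>

// Primary template: the "real" implementation, selected when the cast is legal.
template <bool cast_ok, typename In, typename Out>
struct ExampleInner {
  static Out run(In v) { return static_cast<Out>(v); }
};

// Specialization for illegal casts: mirrors ReportCanCastBug above.
template <typename In, typename Out>
struct ExampleInner<false, In, Out> {
  static Out run(In) {
    assert(false && "canCast should have been checked before dispatch");
    return Out{};
  }
};

// The caller computes the flag the same way the operators do with can_cast.
int demo(float v) {
  return ExampleInner<std::is_convertible<float, int>::value, float, int>::run(v);
}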
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::promoteTypes; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + torch::executor::apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& pow_Tensor_Tensor_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK( + ctx, common_type != exec_aten::ScalarType::Bool, InvalidArgument, out); + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + constexpr auto name = "pow.Tensor_Tensor_out"; + constexpr int kNnlibMaxDim = 16; + int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); + bool optimized = true; + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted && b_is_broadcasted); + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if (out_type != ScalarType::Float) + optimized = false; + + if (max_dim > kNnlibMaxDim) + optimized = false; + + WORD32 num_elm = out.numel(); + + if (optimized) { + if (broadcast) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + WORD32* __restrict__ ptr2 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)a.const_data_ptr(); + WORD32* __restrict__ pin2 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + WORD32 p_inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i] = a.size(i); + for (int i = 0; i < b_dim; i++) + p_inp2_shape[i] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + xa_nn_broadcast_32_32(ptr2, p_out_shape, pin2, p_inp2_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr2; + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + free(ptr2); + } else if (a_is_broadcasted && (!b_is_broadcasted)) { + FLOAT32* __restrict__ ptr1 = + (FLOAT32* __restrict__)malloc((num_elm + 2) * sizeof(WORD32)); + + FLOAT32* __restrict__ pin1 = + (FLOAT32* __restrict__)a.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i] = a.size(i); + + xa_nn_broadcast_32_32( + (WORD32*)ptr1, p_out_shape, (WORD32*)pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else if (b_is_broadcasted && (!a_is_broadcasted)) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < b_dim; i++) + p_inp1_shape[i] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr1; + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else { + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + } + return out; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + 
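+          // Portable fallback (reached when the NNLIB fast path above was skipped
+          // because the output dtype is not float or the tensor rank exceeds
+          // kNnlibMaxDim, 16 here): the nested ET_SWITCH macros instantiate this
+          // lambda for every supported (a, b, out) dtype combination, promote both
+          // operands to the common type CTYPE_IN, and PowInner computes std::pow
+          // element by element via apply_binary_elementwise_fn.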
ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + PowInner< + !std::is_same::value && + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + + return out; +} + +Tensor& pow_Tensor_Scalar_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = torch::executor::native::utils::get_scalar_dtype(b); + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + a_type, b, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + constexpr auto name = "pow.Tensor_Scalar_out"; + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + torch::executor::native::utils::extract_scalar(b, &val_b); + torch::executor::apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +Tensor& pow_Scalar_out( + KernelRuntimeContext& ctx, + const Scalar& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, b.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = torch::executor::native::utils::get_scalar_dtype(a); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + b_type, a, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + constexpr auto name = "pow.Scalar_out"; + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_SCALAR_OBJ_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_A val_a = 0; + torch::executor::native::utils::extract_scalar(a, &val_a); + + torch::executor::apply_unary_map_fn( + [val_a](const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + b.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_rsqrt.cpp b/backends/cadence/hifi/operators/op_rsqrt.cpp new file mode 100644 index 0000000000..1cf717988a --- /dev/null +++ b/backends/cadence/hifi/operators/op_rsqrt.cpp @@ -0,0 +1,55 @@ +/* + 
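The two Scalar overloads that close out op_pow.cpp follow one recipe: resize the output to the tensor operand's shape, extract the scalar once, then apply std::pow through a unary map over the tensor (pow_Scalar_out simply swaps which operand is the base). The float-only semantics, written as a plain loop for illustration (not the actual implementation):

#include <cmath>
#include <cstddef>

// pow.Tensor_Scalar_out: out[i] = a[i] ** b for a fixed scalar exponent b.
void pow_tensor_scalar_reference(const float* a, float b, float* out, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = std::pow(a[i], b);
  }
}

// pow.Scalar_out: out[i] = a ** b[i] for a fixed scalar base a.
void pow_scalar_tensor_reference(float a, const float* b, float* out, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i) {
    out[i] = std::pow(a, b[i]);
  }
}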
* Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { +namespace { + +double rsqrt(double x) { + return 1.0 / std::sqrt(x); +} + +} // namespace + +Tensor& rsqrt_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { + bool optimized = true; + + if (out.scalar_type() != ScalarType::Float) + optimized = false; + + if (optimized) { + WORD32 num_elm = out.numel(); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp = + (const FLOAT32* __restrict__)in.const_data_ptr(); + + xa_nn_elm_rsqrt_f32_f32(p_out, p_inp, num_elm); + return out; + } + + return torch::executor::native::internal:: + unary_ufunc_realhbbf16_to_floathbf16(rsqrt, ctx, in, out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index 0f56a1a963..b8e1d117fb 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -26,6 +26,8 @@ using ::executorch::aten::Tensor; using ::executorch::runtime::getLeadingDims; using ::executorch::runtime::KernelRuntimeContext; +// The nnlib kernel to compute quantized linear via matmul. + void _quantized_linear_asym8u( const Tensor& in, const Tensor& weight, @@ -37,37 +39,30 @@ void _quantized_linear_asym8u( int64_t out_zero_point, __ET_UNUSED const optional& offset, Tensor& out) { - // input comes in shape [leading_dims, in_dim] - // weight comes in shape [out_dim, in_dim] - // output comes in empty with shape [leading_dims, out_dim] - // Perform matrix multiply (M x N) x (N x P)' => M x P const int64_t leading_dims = getLeadingDims(in, in.dim() - 1); const int64_t out_dim = weight.size(0); // = out_dim const int64_t in_dim = weight.size(1); // = in_dim - const uint8_t* __restrict__ in_data = in.const_data_ptr(); const uint8_t* __restrict__ weight_data = weight.const_data_ptr(); const int32_t* __restrict__ bias_data = bias.const_data_ptr(); uint8_t* __restrict__ out_data = out.mutable_data_ptr(); - - // The nnlib kernel to compute quantized linear via matmul. int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u( - out_data, // p_out - weight_data, // p_mat1, - in_data, // p_mat2, - bias_data, // p_bias - out_dim, // rows of p_mat1 - in_dim, // cols of p_mat1 - in_dim, // row_stride of p_mat1 - leading_dims, // vec_count, i.e., rows of p_mat2 - in_dim, // vec_offset of p_mat2. 
- out_dim, // out_offset, i.e., offset of next output element written - 1, // out_stride, i.e., stride to go to next output row + out_data, + weight_data, + in_data, + bias_data, + out_dim, + in_dim, + in_dim, + leading_dims, + in_dim, + out_dim, + 1, -weight_zero_point.const_data_ptr()[0], // mat1_zero_bias -in_zero_point, // mat2_zero_bias - out_multiplier.const_data_ptr()[0], // out_multiplier - out_shift.const_data_ptr()[0], // out_shift - out_zero_point); // out_zero_bias + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + out_zero_point); ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed"); } diff --git a/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 b/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 index 6a9ea45e23..102944a6f7 160000 --- a/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 +++ b/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 @@ -1 +1 @@ -Subproject commit 6a9ea45e23ef591fe207442df33a5ebe88bbe8de +Subproject commit 102944a6f76a0de4d81adc431f3f132f517aa87f diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c new file mode 100644 index 0000000000..cad3f1a25b --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
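The quantized_linear_out.cpp hunk above only moves the explanatory comment ahead of the function and drops the per-argument annotations; the values passed to xa_nn_matmul_asym8uxasym8u_asym8u are unchanged. The shape mapping the removed comments documented, with illustrative numbers:

// in     : [leading_dims, in_dim]   e.g. [4, 16]
// weight : [out_dim, in_dim]        e.g. [8, 16]  -> passed as p_mat1
// out    : [leading_dims, out_dim]  e.g. [4, 8]
// The call performs (M x N) x (N x P)' => M x P with M = out_dim, N = in_dim,
// P = leading_dims: rows/cols/row_stride of p_mat1 are out_dim/in_dim/in_dim,
// vec_count/vec_offset of p_mat2 (the input) are leading_dims/in_dim, and the
// result is written with out_offset = out_dim and out_stride = 1.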
+ +******************************************************************************/ +/* + * xa_nn_broadcast_8_8.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += 
step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c new file mode 100644 index 0000000000..34a7111ee7 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
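xa_nn_broadcast_32_32 above decomposes the broadcast into (load, replicate, repeat) expansion rules and then copies whole sub-blocks with xa_nn_memcpy; functionally it matches the usual stride-0 indexing formulation. A naive reference, useful only for understanding the contract (equal-rank, right-aligned shapes with at most 8 dimensions, as the kernel itself requires):

#include <cstdint>

// Reference broadcast of a 32-bit buffer: each output element copies the input
// element whose coordinate is clamped to 0 along every size-1 input dimension.
void broadcast_32_reference(std::int32_t* out, const int* out_shape,
                            const std::int32_t* in, const int* in_shape,
                            int num_dims) {
  int in_strides[8];  // NUMDIMS_MAX in the kernel above
  int stride = 1;
  for (int d = num_dims - 1; d >= 0; --d) {
    in_strides[d] = (in_shape[d] == 1) ? 0 : stride;  // broadcast dims get stride 0
    stride *= in_shape[d];
  }
  int total = 1;
  for (int d = 0; d < num_dims; ++d) {
    total *= out_shape[d];
  }
  for (int i = 0; i < total; ++i) {
    int rem = i;
    int src = 0;
    for (int d = num_dims - 1; d >= 0; --d) {
      src += (rem % out_shape[d]) * in_strides[d];
      rem /= out_shape[d];
    }
    out[i] = in[src];
  }
}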
+ +******************************************************************************/ +/* + * xa_nn_broadcast_32_32.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += 
step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c new file mode 100644 index 0000000000..3af93fc00c --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c @@ -0,0 +1,847 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +#include "nnlib-hifi4/xa_nnlib/include/xa_type_def.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" +#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" +#include "nnlib-hifi4/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h" + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_maximum_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_maximum_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + y = XT_MAX_SX2(x2, x1); + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_MAX_SX2(x2, x1); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_MAX_S(a1, a2); + XT_SSI(a, (xtfloat *)out, 0); + } + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_maximum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_MAX_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } 
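+      /* The loop above is the fast path taken when p_a, p_b and p_c are all
+       * 8-byte aligned: XT_LSX2IP/XT_SSX2IP move two floats per iteration.
+       * The else branch below handles unaligned pointers through ae_valign
+       * priming loads (XT_LASX2PP/XT_LASX2IP), and the trailing
+       * num_scalar_ops check picks up the odd remaining element of each row. */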
+ } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_MAX_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_MAX_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } +} + +static void internal_elm_maximum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_maximum_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_maximum_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 
1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_maximum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_maximum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_minimum_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_minimum_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + y = XT_MIN_SX2(x2, x1); + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_MIN_SX2(x2, x1); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_MIN_S(a1, a2); + XT_SSI(a, (xtfloat *)out, 0); + } + 
return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_minimum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_MIN_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_MIN_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_MIN_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } +} + +static void internal_elm_minimum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? 
p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_minimum_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_minimum_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_minimum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ 
p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_minimum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c new file mode 100644 index 0000000000..4dcec52f97 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c @@ -0,0 +1,1151 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ("Cadence */ +/* Libraries") are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* DSP Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2015-2018 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +/* + NatureDSP Signal Processing Library. Vector mathematics + Vector operations + code optimized for HiFi4 core + IntegrIT, 2006-2018 +*/ + +#include "../include/NatureDSP_Signal_math.h" +#include "NatureDSP_types.h" +#include "xa_nn_common.h" + +/* Common helper macros. */ +#include "xa_nnlib_common_fpu.h" + +#include "xa_nnlib_common.h" +/* Constant tables. */ + +const union ufloat32uint32 ALIGN(8) xa_nnlib_pow2f_coef[] = +{ + { 0x39222a65 }, + { 0x3aaf931c }, + { 0x3c1d94fc }, + { 0x3d63578a }, + { 0x3e75fdf0 }, + { 0x3f317218 }, + { 0x3f800000 } + + //{ 0x3aaf931b }, + //{ 0x3c1e7220 }, + //{ 0x3d63578a }, + //{ 0x3e75fcc9 }, + //{ 0x3f317218 }, + //{ 0x3f800000 } + +}; + +const union ufloat32uint32 ALIGN(8) xa_nnlib_log2f_coef[] = +{ + { 0x3d726a49 }, + { 0x3dd91c88 }, + { 0x3ddde76c }, + { 0x3de21e63 }, + { 0x3dfe600b }, + { 0x3e124679 }, + { 0x3e2ab2f1 }, + { 0x3e4ccd1b }, + { 0x3e7fffde }, + { 0x3eaaaaaa }, + { 0x3f000000 }, + { 0x3f800000 }, + /* log2(e) */ + { 0x3fb8aa3b }, /* 1.4426950216 */ + { 0x32a57060 } /* 1.9259629891e-008 */ +}; + +const union ufloat32uint32 xa_nnlib_pow_plusInff ={0x7f800000}; + +const union ufloat32uint32 xa_nnlib_pow_qNaNf = { 0x7fc00000 }; + +#define MIN(a,b) ( (a)<(b) ? 
(a) : (b) ) +#define MAX(a,b) ( (a)>(b) ? (a) : (b) ) + +/*------------------------------------------------------------------------- + Power function + These routines calculate power function for 32-bit fixed-point numbers or + floating point numbers. + For the fixed point API, The base is represented in Q31, the exponent + is represented in Q6.25. Results are represented as normalized fixed point + number with separate mantissa in Q31 and exponent. + + Precision: + 32x32 32-bit inputs, 32-bit outputs + f floating point input, floating point output + + Accuracy: + 2 ULP for fixed point API + 2 ULP under condition that |y|<=100 + + Notes: +1. Scalar floating point raise to a power functions conform to ANSI C requirements on + standard math library functions in respect to treatment of errno and floating- + point exceptions. Vectorized function does not touch errno and may raise or not raise + floating point exceptions. +2. For floating point API, If x<0 is finite, y is finite and not an integer value, + then the respective result z is set to NaN +3. For fixed point API, function returns zero for all non-positive x. Fixed point + functions never touch errno + + Special cases: + x | y | Result | Extra Conditions + --------+--------+--------+--------------------- + floating point API + --------+--------+--------+--------------------- + +/-0 | y | +/-inf | odd y<0 + +/-0 | y | +inf | even y<0 + +/-0 | y | +/-0 | odd y>0 + +/-0 | y | 0 | even y>0 + +/-1 | +/-inf | 1 | + 1 | y | 1 | any y including NaN + x | +/-0 | 1 | any x including NaN + x | y | NaN | finite x<0 and finite + | | | non-integer y (see + | | | note 2) + x | -inf | +inf | |x|<1 + x | -inf | 0 | |x|>1 + x | +inf | 0 | |x|<1 + x | +inf | +inf | |x|>1 + -inf | y | -0 | y an odd integer <0 + -inf | y | 0 | y<0 and not an odd + | | | integer + -inf | y | -inf | y an odd integer >0 + -inf | y | +inf | y>0 and not an odd + | | | integer + +inf | y | 0 | y<0 + +inf | y | +inf | y>0 + --------+--------+--------+--------------------- + fixed point API + --------+--------+--------+--------------------- + x | y | 0 | x<=0 + --------+--------+--------+--------------------- + + Input: + x[N] input data,Q0.31 or floating point + y[N] input data,Q6.25 or floating point + N length of vectors + Output (fixed point API): + m[N] mantissa of output, Q31 + e[N] exponent of output + Output (floating point API): + z[N] results: floating point + + Restriction: + z,x,y,m should not overlap +-------------------------------------------------------------------------*/ + +#if !HAVE_VFPU && !HAVE_FPU +DISCARD_FUN(void, xa_nn_elm_pow_f32, (FLOAT32 * restrict z, const FLOAT32 * restrict y, const FLOAT32 * restrict x, WORD32 N)) +#elif HAVE_VFPU +#define sz_f32 (int)sizeof(FLOAT32) +static void mypowf(FLOAT32 * scr, + FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + WORD32 N ) +{ + /* Table of different constants used in computations */ + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + int n; + const xtfloatx2 * pX; + const xtfloatx2 * pY; + + const xtfloatx2 * restrict S_rd; + xtfloatx2 * restrict S_wr; + xtfloatx2 * restrict pZ; + const ae_int32 * restrict TBL; + const xtfloat * restrict TBL_LOG2; + const xtfloat * restrict TBL_POW2; + xtfloatx2 x0, y0, z0, t0, t1, ef0; + xtfloatx2 c2f, 
c3f, c4f; + xtfloatx2 _0, _1, half; + ae_int32x2 c0i, c1i, c5i, c7i, c8i; + ae_int32x2 e0, xi0, yi0, ex0; + xtbool2 bsx, bsy, bdenorm, bsmall; + ae_valign aX, aY, aZ; + + /* overall number of blocks; number of values in the current block */ + int blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ / (3*sz_f32); + + + if (N <= 0) return; + + NASSERT(N % 2 == 0); + NASSERT_ALIGN16(scr); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + + blkLen = 0; + TBL = (const ae_int32 *)c_tbl; + for (; N>0; N -= blkLen, x += blkSize, y += blkSize, z += blkSize) + { + blkLen = XT_MIN(N, blkSize); + _0 = 0.0f; + _1 = (1.0f); + half = (0.5f); + { + pX = (const xtfloatx2*)x; + S_wr = (xtfloatx2*)scr; + aX = AE_LA64_PP(pX); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LASX2IP(x0, aX, pX); + + x0 = XT_ABS_SX2(x0); + c0i = AE_L32_I(TBL, 0 * 4); /*-126*/ + c1i = AE_L32_I(TBL, 1 * 4); /*-150*/ + c2f = XT_LSI((xtfloat*)TBL, 2 * 4); + c3f = XT_LSI((xtfloat*)TBL, 3 * 4); + /* process denormalized values */ + bdenorm = XT_OLE_SX2(x0, c2f); + t0 = XT_MUL_SX2(x0, c3f); + XT_MOVT_SX2(x0, t0, bdenorm); + e0 = c0i; + AE_MOVT32X2(e0, c1i, bdenorm); + /* extract exponent */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(c2f);/* load mantissa mask */ //!!!!!!!!!!!!! + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, c5i); + x0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + c4f = XT_LSI((xtfloat*)TBL, 4 * 4); + bsmall = XT_OLT_SX2(x0, c4f); + t0 = XT_ADD_SX2(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_SX2(x0, t0, bsmall); + AE_MOVT32X2(e0, ex0, bsmall); + x0 = XT_SUB_SX2(_1, x0); //!!! + ef0 = XT_FLOAT_SX2(e0, 0); //!!! 
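
The first pass above reduces each |x| to a mantissa in [sqrt(0.5), sqrt(2)) and an integer exponent, and what it writes to scratch is (1 - mantissa) together with the exponent converted to float; the next pass feeds these into the log2 polynomial. A rough NumPy restatement of that split, for illustration only (np.frexp stands in for the manual bit manipulation, and the denormal rescaling by 2^24 is omitted):

```python
import numpy as np

def split_for_log2(x: np.ndarray):
    """|x| = m * 2**e, with m adjusted into [sqrt(0.5), sqrt(2))."""
    m, e = np.frexp(np.abs(x))            # m in [0.5, 1), |x| = m * 2**e
    small = m < np.sqrt(0.5)              # 'bsmall' in the kernel above
    m = np.where(small, 2.0 * m, m)       # double the mantissa ...
    e = np.where(small, e - 1, e)         # ... and compensate in the exponent
    return (1.0 - m).astype(np.float32), e.astype(np.float32)
```
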
+ XT_SSX2IP(x0, S_wr, 2 * sz_f32); + XT_SSX2IP(ef0, S_wr, 2*2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloatx2 p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloatx2 p10, p11, p12, p13; + xtfloatx2 t2, w0, w1; + S_wr = ( xtfloatx2*)scr+2; + S_rd = (const xtfloatx2*)scr; + TBL_LOG2 = (const xtfloat *)xa_nnlib_log2f_coef; + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(x0, S_rd, 3*2 * sz_f32); + //XT_LSX2IP(ef0, S_rd, 2 * sz_f32); + + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI(TBL_LOG2, 0 * 4); + p1 = XT_LSI(TBL_LOG2, 1 * 4); + p2 = XT_LSI(TBL_LOG2, 2 * 4); + p3 = XT_LSI(TBL_LOG2, 3 * 4); + p4 = XT_LSI(TBL_LOG2, 4 * 4); + p5 = XT_LSI(TBL_LOG2, 5 * 4); + p6 = XT_LSI(TBL_LOG2, 6 * 4); + p7 = XT_LSI(TBL_LOG2, 7 * 4); + p8 = XT_LSX(TBL_LOG2, 8 * 4); + p9 = XT_LSX(TBL_LOG2, 9 * 4); + + XT_MADD_SX2(p1, x0, p0); + XT_MADD_SX2(p2, x0, p1); + XT_MADD_SX2(p3, x0, p2); + XT_MADD_SX2(p4, x0, p3); + XT_MADD_SX2(p5, x0, p4); + XT_MADD_SX2(p6, x0, p5); + XT_MADD_SX2(p7, x0, p6); + XT_MADD_SX2(p8, x0, p7); + XT_MADD_SX2(p9, x0, p8); + t2 = p9; + XT_SSX2IP(t2, S_wr, 3*2 * sz_f32); + } + S_wr = (xtfloatx2*)scr; + S_rd = (const xtfloatx2*)scr; + for (n = 0; n<(blkLen >> 1); n++) + { + p10 = XT_LSX(TBL_LOG2, 10 * 4); + p11 = XT_LSX(TBL_LOG2, 11 * 4); + p12 = XT_LSX(TBL_LOG2, 12 * 4); + p13 = XT_LSX(TBL_LOG2, 13 * 4); + + XT_LSX2IP(x0, S_rd, 2 * sz_f32); + XT_LSX2IP(ef0, S_rd, 2 * sz_f32); + XT_LSX2IP(t2, S_rd, 2 * sz_f32); + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_SX2(x0, t2); t1 = t0; + XT_MSUB_SX2(t1, x0, t2); + w0 = XT_ADD_SX2(t0, p10); + w1 = XT_SUB_SX2(w0, p10); + w1 = XT_SUB_SX2(t0, w1); + w1 = XT_SUB_SX2(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_SX2(x0, t0); w1 = w0; + XT_MSUB_SX2(w1, x0, t0); t0 = w0; + XT_MSUB_SX2(w1, x0, t1); t1 = w1; + w0 = XT_ADD_SX2(t0, p11); + w1 = XT_SUB_SX2(w0, p11); + w1 = XT_SUB_SX2(t0, w1); + w1 = XT_SUB_SX2(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_SX2(x0); + w0 = XT_MUL_SX2(x0, t0); w1 = w0; + XT_MSUB_SX2(w1, x0, t0); t0 = w0; + XT_MSUB_SX2(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_SX2(t0, p12); w1 = w0; + XT_MSUB_SX2(w1, t0, p12); + XT_MADD_SX2(w1, t1, p12); + XT_MSUB_SX2(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_SX2(t0, ef0); + w1 = XT_SUB_SX2(w0, ef0); + w1 = XT_SUB_SX2(t0, w1); + t1 = XT_SUB_SX2(w1, t1);//!!!! + t0 = w0; // !!!!! + XT_SSX2IP(t0, S_wr, 2 * sz_f32); + XT_SSX2IP(t1, S_wr, 2*2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloatx2 xy, dxy, c0, c1; + xtfloatx2 p0, p1, p2, p3, p4, p5, p6; + S_wr = ( xtfloatx2*)scr+2; + S_rd = (const xtfloatx2*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloatx2*)y; + aY = AE_LA64_PP(pY); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(t0, S_rd, 2 * sz_f32); + XT_LSX2IP(t1, S_rd, 2*2 * sz_f32); + + XT_LASX2IP(y0, aY, pY); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FIROUND_SX2(XT_MUL_SX2(y0, t0)); + dxy = XT_NEG_SX2(xy); + XT_MADD_SX2(dxy, y0, t0); + XT_MADD_SX2(dxy, y0, t1); + dxy = XT_MIN_SX2(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_SX2(dxy, (xtfloatx2)-1.0f); + /* compute 2^fract */ + p0 = XT_LSI(TBL_POW2, 0 * 4); + p1 = XT_LSI(TBL_POW2, 1 * 4); + p2 = XT_LSI(TBL_POW2, 2 * 4); + p3 = XT_LSI(TBL_POW2, 3 * 4); + p4 = XT_LSI(TBL_POW2, 4 * 4); + + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_SX2(p1, dxy, p0); + XT_MADD_SX2(p2, dxy, p1); + XT_MADD_SX2(p3, dxy, p2); + XT_MADD_SX2(p4, dxy, p3); + XT_SSX2IP(p4, S_wr, 3*2 * sz_f32); + } + __Pragma("no_reorder"); + S_wr = (xtfloatx2*)scr; + S_rd = (const xtfloatx2*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloatx2*)y; + aY = AE_LA64_PP(pY); + for (n = 0; n<(blkLen >> 1); n++) + { + + XT_LSX2IP(t0, S_rd, 2 * sz_f32); + XT_LSX2IP(t1, S_rd, 2 * sz_f32); + XT_LSX2IP(p4, S_rd, 2 * sz_f32); + p5 = XT_LSI(TBL_POW2, 5 * 4); + p6 = XT_LSI(TBL_POW2, 6 * 4); + XT_LASX2IP(y0, aY, pY); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FIROUND_SX2(XT_MUL_SX2(y0, t0)); + dxy = XT_NEG_SX2(xy); + XT_MADD_SX2(dxy, y0, t0); + XT_MADD_SX2(dxy, y0, t1); + dxy = XT_MIN_SX2(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_SX2(dxy, (xtfloatx2)-1.0f); + XT_MADD_SX2(p5, dxy, p4); + XT_MADD_SX2(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_SX2(xy, 0); + c7i = AE_L32_I(TBL, 7 * 4);/* -252 */ + c8i = AE_L32_X(TBL, 8 * 4);/* 254 */ + e0 = AE_MAX32(e0, c7i); + e0 = AE_MIN32(e0, c8i); + e0 = AE_ADD32(e0, c8i); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + c0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(e0); + c1 = XT_AE_MOVXTFLOATX2_FROMINT32X2(ex0); + z0 = XT_MUL_SX2(z0, c1); + z0 = XT_MUL_SX2(z0, c0); //!!!!!!!!!!!! + XT_SSX2IP(z0, S_wr, 2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtbool2 b_yint, b_e0, b0, b_notspec; + xtbool2 b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool2 b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloatx2 xabs, spec; + ae_int32x2 sgn, zi0; + + S_rd = (const xtfloatx2*)scr; + pY = (const xtfloatx2*)y; + pX = (const xtfloatx2*)x; + pZ = ( xtfloatx2*)z; + aY = AE_LA64_PP(pY); + aX = AE_LA64_PP(pX); + aZ = AE_ZALIGN64(); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(z0, S_rd, 2 * sz_f32); + XT_LASX2IP(x0, aX, pX); + XT_LASX2IP(y0, aY, pY); + /* Take sign of x and y */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + yi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(y0); + bsx = XT_OLT_SX2(xi0, (xtfloatx2)0.0f); + bsy = XT_OLT_SX2(yi0, (xtfloatx2)0.0f); + + xabs = XT_ABS_SX2(x0); + /* check if y is integer */ + t0 = XT_FITRUNC_SX2(y0); + b_yint = XT_OEQ_SX2(t0, y0); + + /* check if y is odd */ + e0 = XT_TRUNC_SX2(y0, 0); //temp0 + b_e0 = AE_EQ32(e0, MAX_INT32);//~b_tmp0 + b0i = AE_MOVAB2(b_e0); + b1i = AE_MOVAB2(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA2(b0i); + AE_MOVF32X2(e0, AE_ZERO32(), b0); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = XT_OEQ_SX2((xtfloatx2)0.0f, y0); /* y ==0 */ + b_yinf = XT_OEQ_SX2(XT_ABS_SX2(y0), xa_nnlib_pow_plusInff.f); /* |y|==Inf */ + b_xeqz = XT_OEQ_SX2(x0, (xtfloatx2)0.0f); /* x ==0 */ + b_xeq1 = XT_OEQ_SX2(xabs, (xtfloatx2)1.0f); /* |x|==1 */ + b_xinf = XT_OEQ_SX2(xabs, xa_nnlib_pow_plusInff.f); /* |x|==INF */ + + yint = AE_MOVAB2(b_yint); + yeqz = AE_MOVAB2(b_yeqz); + yinf = AE_MOVAB2(b_yinf); + xeqz = AE_MOVAB2(b_xeqz); + xeq1 = AE_MOVAB2(b_xeq1); + xinf = AE_MOVAB2(b_xinf); + sx = AE_MOVAB2(bsx); + sy = AE_MOVAB2(bsy); + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* 
x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA2(NaN1); + b_NaN2 = XT_UN_SX2(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA2(one); + b_Inf = AE_MOVBA2(Inf); + b_zero = AE_MOVBA2(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloatx2)xa_nnlib_pow_qNaNf.f; + XT_MOVF_SX2(spec, half, b_NaN1); + XT_MOVT_SX2(spec, _0, b_zero); + XT_MOVT_SX2(spec, xa_nnlib_pow_plusInff.f, b_Inf); + XT_MOVT_SX2(spec, xa_nnlib_pow_qNaNf.f, b_NaN2); + XT_MOVT_SX2(spec, _1, b_one); + + b_notspec = XT_OEQ_SX2(spec, half); + /* Replace result with special numbers if needed */ + XT_MOVF_SX2(z0, spec, b_notspec); + /* Restore sign and store result */ + zi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(zi0); + XT_SASX2IP(z0, aZ, pZ); + } + } + XT_SASX2POSFP(aZ, pZ); + } +} /* mypowf() */ +void xa_nn_elm_pow_f32( FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + int N ) +{ + const int blkSize = MAX_ALLOCA_SZ/sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + FLOAT32 ALIGN(16) scr[blkSize]; + int M; + if ( N<=0 ) return; + M=N&~1; + if ( M ) + { + mypowf(scr,z,x,y,M); + y += M; + x += M; + z += M; + N&=1; + } + if (N) + { // processing the tail + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + xtfloat x0, y0, t0, ef0, t1, t2; + xtfloat xy, dxy, z0, c0, c1; + xtfloat p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloat p10, p11, p12, p13, w0, w1; + xtbool bdenorm, bsmall; + ae_int32 e0, xi0, ex0; + x0=XT_LSI((const xtfloat*)x,0); + + x0 = XT_ABS_S(x0); + + /* process denormalized values */ + bdenorm = xtbool2_extract_0(XT_OLE_S(x0, XT_LSI((xtfloat*)c_tbl, 2 * 4))); + t0 = XT_MUL_S(x0, XT_LSI((xtfloat*)c_tbl, 3 * 4)); + XT_MOVT_S(x0, t0, (bdenorm)); + e0 = AE_L32_I((ae_int32 *)c_tbl, 0 * 4);; + AE_MOVT_32(e0, AE_L32_I((ae_int32 *)c_tbl, 1 * 4), (bdenorm)); + /* extract exponent */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(XT_LSI((xtfloat*)c_tbl, 2 * 4));/* load mantissa mask */ //!!!!!!!!!!!!! + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, AE_L32_I((ae_int32 *)c_tbl, 5 * 4)); + x0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + + bsmall = xtbool2_extract_0(XT_OLT_S(x0, XT_LSI((xtfloat*)c_tbl, 4 * 4))); + + + t0 = XT_ADD_S(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_S(x0, t0, bsmall); + AE_MOVT_32(e0, ex0, bsmall); + x0 = XT_SUB_S(1.0f, x0); //!!! + ef0 = XT_FLOAT_S(e0, 0); //!!! 
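
The vector loop above computes the result for |x| and then restores the sign through the `sgn` mask: the output is negative only when x < 0 and y is an odd integer (finite negative bases with non-integer y are routed to NaN by the special-case masks). The scalar tail below applies the same rule; in NumPy terms the sign step is simply:

```python
import numpy as np

def restore_sign(x, y, z_abs):
    """z_abs holds |x|**y; flip its sign only for a negative base and an odd integer exponent."""
    y_int = np.isfinite(y) & (np.trunc(y) == y)
    y_odd = y_int & (np.fmod(np.trunc(y), 2.0) != 0.0)
    return np.where((x < 0.0) & y_odd, -z_abs, z_abs)
```
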
+ + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 0 * 4); + p1 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 1 * 4); + p2 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 2 * 4); + p3 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 3 * 4); + p4 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 4 * 4); + p5 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 5 * 4); + p6 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 6 * 4); + p7 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 7 * 4); + p8 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 8 * 4); + p9 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 9 * 4); + + + XT_MADD_S(p1, x0, p0); + XT_MADD_S(p2, x0, p1); + XT_MADD_S(p3, x0, p2); + XT_MADD_S(p4, x0, p3); + XT_MADD_S(p5, x0, p4); + XT_MADD_S(p6, x0, p5); + XT_MADD_S(p7, x0, p6); + XT_MADD_S(p8, x0, p7); + XT_MADD_S(p9, x0, p8); + t2 = p9; + + + p10 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 10 * 4); + p11 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 11 * 4); + p12 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 12 * 4); + p13 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 13 * 4); + + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_S(x0, t2); t1 = t0; + XT_MSUB_S(t1, x0, t2); + w0 = XT_ADD_S(t0, p10); + w1 = XT_SUB_S(w0, p10); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + w0 = XT_ADD_S(t0, p11); + w1 = XT_SUB_S(w0, p11); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_S(x0); + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_S(t0, p12); w1 = w0; + XT_MSUB_S(w1, t0, p12); + XT_MADD_S(w1, t1, p12); + XT_MSUB_S(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_S(t0, ef0); + w1 = XT_SUB_S(w0, ef0); + w1 = XT_SUB_S(t0, w1); + t1 = XT_SUB_S(w1, t1);//!!!! + t0 = w0; // !!!!! + + /* compute y*log2(x) and separate result into integer and fractional parts */ + y0 = XT_LSI((const xtfloat*)y, 0); + xy = XT_FIROUND_S(XT_MUL_S(y0, t0)); + dxy = XT_NEG_S(xy); + XT_MADD_S(dxy, y0, t0); + XT_MADD_S(dxy, y0, t1); + dxy = XT_MIN_S(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_S(dxy, (xtfloatx2)-1.0f); + /* compute 2^fract */ + p0 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 0 * 4); + p1 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 1 * 4); + p2 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 2 * 4); + p3 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 3 * 4); + p4 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 4 * 4); + p5 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 5 * 4); + p6 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 6 * 4); + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_S(p1, dxy, p0); + XT_MADD_S(p2, dxy, p1); + XT_MADD_S(p3, dxy, p2); + XT_MADD_S(p4, dxy, p3); + XT_MADD_S(p5, dxy, p4); + XT_MADD_S(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_SX2(xy, 0); + e0 = AE_MAX32(e0, AE_L32_I((ae_int32 *)c_tbl, 7 * 4)); + e0 = AE_MIN32(e0, AE_L32_X((ae_int32 *)c_tbl, 8 * 4)); + e0 = AE_ADD32(e0, AE_L32_X((ae_int32 *)c_tbl, 8 * 4)); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + c0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(e0); + c1 = XT_AE_MOVXTFLOATX2_FROMINT32X2(ex0); + z0 = XT_MUL_S(z0, c1); + z0 = XT_MUL_S(z0, c0); //!!!!!!!!!!!! + + + /* Take sign of x and y */ + { + xtbool2 bsx, bsy, b_yint, b_e0, b0, b_notspec; + + xtbool2 b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool2 b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloat xabs, spec; + ae_int32 sgn, zi0; + + x0 = XT_LSI((const xtfloat*)x, 0); + y0 = XT_LSI((const xtfloat*)y, 0); + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + bsx = (XT_OLT_S(x0, (xtfloat)0.0f)); + bsy = (XT_OLT_S(y0, (xtfloat)0.0f)); + + xabs = XT_ABS_S(x0); + /* check if y is integer */ + t0 = XT_FITRUNC_S(y0); + b_yint = (XT_OEQ_S(t0, y0)); + + /* check if y is odd */ + e0 = XT_TRUNC_S(y0, 0); //temp0 + b_e0 = (AE_EQ32(e0, MAX_INT32));//~b_tmp0 + b0i = AE_MOVAB2(b_e0); + b1i = AE_MOVAB2(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA2(b0i); + AE_MOVF_32(e0, AE_ZERO32(), xtbool2_extract_0(b0)); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = (XT_OEQ_S((xtfloatx2)0.0f, y0)); /* y ==0 */ + b_yinf = (XT_OEQ_S(XT_ABS_SX2(y0), xa_nnlib_pow_plusInff.f)); /* |y|==Inf */ + b_xeqz = (XT_OEQ_S(x0, (xtfloatx2)0.0f)); /* x ==0 */ + b_xeq1 = (XT_OEQ_S(xabs, (xtfloatx2)1.0f)); /* |x|==1 */ + b_xinf = (XT_OEQ_S(xabs, xa_nnlib_pow_plusInff.f)); /* |x|==INF */ + + yint = AE_MOVAB2 (b_yint); + yeqz = AE_MOVAB2 (b_yeqz); + yinf = AE_MOVAB2 (b_yinf); + xeqz = AE_MOVAB2 (b_xeqz); + xeq1 = AE_MOVAB2 (b_xeq1); + xinf = AE_MOVAB2 (b_xinf); + sx = AE_MOVAB2 (bsx); + sy = AE_MOVAB2 (bsy); + + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA2(NaN1); + b_NaN2 = XT_UN_SX2(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA2(one); + b_Inf = AE_MOVBA2(Inf); + b_zero = AE_MOVBA2(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloat)xa_nnlib_pow_qNaNf.f; + XT_MOVF_S(spec, 0.5f, xtbool2_extract_0(b_NaN1)); + XT_MOVT_S(spec, 0.0f, xtbool2_extract_0(b_zero)); + XT_MOVT_S(spec, xa_nnlib_pow_plusInff.f, xtbool2_extract_0(b_Inf)); + XT_MOVT_S(spec, xa_nnlib_pow_qNaNf.f, xtbool2_extract_0(b_NaN2)); + XT_MOVT_S(spec, 1.0f, xtbool2_extract_0(b_one)); + + b_notspec = XT_OEQ_S(spec, 0.5f); + /* Replace result with special numbers if needed */ + XT_MOVF_S(z0, spec, xtbool2_extract_0(b_notspec)); + /* Restore sign and store result */ + zi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(zi0); + + XT_SSI(z0,(xtfloat*)z,0); + + } + } + +} /* vec_powf() */ +#else 
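
Both the VFPU path above and the scalar-FPU fallback that follows implement the same decomposition: z = 2^(y·log2|x|), with log2 and 2^frac each evaluated by a short polynomial and the integer part of y·log2|x| applied through the exponent bits. Stripped of the extended-precision splitting and the special-case masking, the underlying math is (NumPy, reference only):

```python
import numpy as np

def pow_core(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """|x|**y == 2**(y * log2|x|); sign and special cases are handled separately."""
    t = y * np.log2(np.abs(x))        # kept as a two-term sum (t0, t1) in the kernel
    i = np.round(t)                   # integer part, applied via the exponent field
    f = np.clip(t - i, -1.0, 1.0)     # fractional part, fed to the 2**f polynomial
    return np.exp2(f) * np.exp2(i)    # the kernel splits 2**i into two factors to avoid overflow
```
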
+#define sz_f32 (int)sizeof(FLOAT32) +void xa_nn_elm_pow_f32(FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + int N) +{ + + const int blkSizef = MAX_ALLOCA_SZ / sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + float ALIGN(16) scr[blkSizef]; + /* Table of different constants used in computations */ + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + int n; + const xtfloat * pX; + const xtfloat * pY; + + const xtfloat * restrict S_rd; + xtfloat * restrict S_wr; + xtfloat * restrict pZ; + const ae_int32 * restrict TBL; + const xtfloat * restrict TBL_LOG2; + const xtfloat * restrict TBL_POW2; + xtfloat x0, y0, z0, t0, t1, ef0; + xtfloat c2f, c3f, c4f; + xtfloat _0, _1, half; + ae_int32x2 c0i, c1i, c5i, c6i, c7i, c8i; + ae_int32 e0, xi0, yi0, ex0; + xtbool bsx, bsy, bdenorm, bsmall; + + /* overall number of blocks; number of values in the current block */ + int blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ / (3 * sz_f32); + + + if (N <= 0) return; + + NASSERT_ALIGN16(scr); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + blkLen = 0; + TBL = (const ae_int32 *)c_tbl; + for (; N>0; N -= blkLen, x += blkSize, y += blkSize, z += blkSize) + { + blkLen = XT_MIN(N, blkSize); + _0 = 0.0f; + _1 = (1.0f); + half = (0.5f); + { + pX = (const xtfloat*)x; + S_wr = ( xtfloat*)scr; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(x0, pX, sz_f32); + + x0 = XT_ABS_S(x0); + c0i = AE_L32_I(TBL, 0 * 4); /* -126 */ + c1i = AE_L32_I(TBL, 1 * 4); /* -150 */ + c2f = XT_LSI((xtfloat*)TBL, 2 * 4); + c3f = XT_LSI((xtfloat*)TBL, 3 * 4); + /* process denormalized values */ + bdenorm = XT_OLE_S(x0, c2f); + t0 = XT_MUL_S(x0, c3f); + XT_MOVT_S(x0, t0, bdenorm); + e0 = c0i; + + AE_MOVT_32(e0, c1i, bdenorm); + /* extract exponent */ + xi0 = XT_RFR(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_RFR(c2f);/* load mantissa mask */ //!!!!!!!!!!!!! + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, c5i); + x0 = XT_WFR(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + c4f = XT_LSI((xtfloat*)TBL, 4 * 4); + bsmall = XT_OLT_S(x0, c4f); + t0 = XT_ADD_S(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_S(x0, t0, bsmall); + AE_MOVT_32(e0, ex0, bsmall); + x0 = XT_SUB_S(_1, x0); //!!! + ef0 = XT_FLOAT_S(e0, 0); //!!! 
+ XT_SSIP(x0, S_wr, sz_f32); + XT_SSIP(ef0, S_wr, 2 * sz_f32); + + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloat p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloat p10, p11, p12, p13; + xtfloat t2, w0, w1; + S_wr = ( xtfloat*)scr + 2; + S_rd = (const xtfloat*)scr; + TBL_LOG2 = (const xtfloat *)xa_nnlib_log2f_coef; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(x0, S_rd, 3*sz_f32); + + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI(TBL_LOG2, 0 * 4); + p1 = XT_LSI(TBL_LOG2, 1 * 4); + p2 = XT_LSI(TBL_LOG2, 2 * 4); + p3 = XT_LSI(TBL_LOG2, 3 * 4); + p4 = XT_LSI(TBL_LOG2, 4 * 4); + p5 = XT_LSI(TBL_LOG2, 5 * 4); + p6 = XT_LSI(TBL_LOG2, 6 * 4); + p7 = XT_LSI(TBL_LOG2, 7 * 4); + p8 = XT_LSX(TBL_LOG2, 8 * 4); + p9 = XT_LSX(TBL_LOG2, 9 * 4); + + XT_MADD_S(p1, x0, p0); + XT_MADD_S(p2, x0, p1); + XT_MADD_S(p3, x0, p2); + XT_MADD_S(p4, x0, p3); + XT_MADD_S(p5, x0, p4); + XT_MADD_S(p6, x0, p5); + XT_MADD_S(p7, x0, p6); + XT_MADD_S(p8, x0, p7); + XT_MADD_S(p9, x0, p8); + t2 = p9; + XT_SSIP(t2, S_wr, 3 * sz_f32); + } + S_wr = ( xtfloat*)scr; + S_rd = (const xtfloat*)scr; + + for (n = 0; n<(blkLen); n++) + { + p10 = XT_LSX(TBL_LOG2, 10 * 4); + p11 = XT_LSX(TBL_LOG2, 11 * 4); + p12 = XT_LSX(TBL_LOG2, 12 * 4); + p13 = XT_LSX(TBL_LOG2, 13 * 4); + + XT_LSIP(x0, S_rd, sz_f32); + XT_LSIP(ef0, S_rd, sz_f32); + XT_LSIP(t2, S_rd, sz_f32); + + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_S(x0, t2); t1 = t0; + XT_MSUB_S(t1, x0, t2); + w0 = XT_ADD_S(t0, p10); + w1 = XT_SUB_S(w0, p10); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + w0 = XT_ADD_S(t0, p11); + w1 = XT_SUB_S(w0, p11); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_S(x0); + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_S(t0, p12); w1 = w0; + XT_MSUB_S(w1, t0, p12); + XT_MADD_S(w1, t1, p12); + XT_MSUB_S(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_S(t0, ef0); + w1 = XT_SUB_S(w0, ef0); + w1 = XT_SUB_S(t0, w1); + t1 = XT_SUB_S(w1, t1);//!!!! + t0 = w0; // !!!!! + XT_SSIP(t0, S_wr, sz_f32); + XT_SSIP(t1, S_wr, sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloat xy, dxy, c0, c1, _m1;; + xtfloat p0, p1, p2, p3, p4, p5, p6; + S_wr = ( xtfloat*)scr; + S_rd = (const xtfloat*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloat*)y; + _m1 = -1.0f; + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(t0, S_rd, sz_f32); + XT_LSIP(t1, S_rd, sz_f32); + XT_LSIP(y0, pY, sz_f32); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FLOAT_S(XT_ROUND_S(XT_MUL_S(y0, t0), 0), 0); + dxy = XT_NEG_S(xy); + XT_MADD_S(dxy, y0, t0); + XT_MADD_S(dxy, y0, t1); + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + c6i = AE_L32_I(TBL, 6 * 4);/* -0.5 */ + dxy = XT_MIN_S(dxy, _1); + dxy = XT_MAX_S(dxy, _m1); + /* compute 2^fract */ + p0 = XT_LSI(TBL_POW2, 0 * 4); + p1 = XT_LSI(TBL_POW2, 1 * 4); + p2 = XT_LSI(TBL_POW2, 2 * 4); + p3 = XT_LSI(TBL_POW2, 3 * 4); + p4 = XT_LSI(TBL_POW2, 4 * 4); + p5 = XT_LSI(TBL_POW2, 5 * 4); + p6 = XT_LSI(TBL_POW2, 6 * 4); + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_S(p1, dxy, p0); + XT_MADD_S(p2, dxy, p1); + XT_MADD_S(p3, dxy, p2); + XT_MADD_S(p4, dxy, p3); + XT_MADD_S(p5, dxy, p4); + XT_MADD_S(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_S(xy, 0); + c7i = AE_L32_I(TBL, 7 * 4);/* -252 */ + c8i = AE_L32_X(TBL, 8 * 4);/* 254 */ + e0 = AE_MAX32(e0, c7i); + e0 = AE_MIN32(e0, c8i); + e0 = AE_ADD32(e0, c8i); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + + c0 = XT_WFR(e0); + c1 = XT_WFR(ex0); + z0 = XT_MUL_S(z0, c1); + z0 = XT_MUL_S(z0, c0); //!!!!!!!!!!!! + XT_SSIP(z0, S_wr, sz_f32); + + } + } + __Pragma("no_reorder"); + /* */ + { + xtbool b_yint, b_e0, b0, b_notspec; + xtbool b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloat xabs, spec; + ae_int32x2 sgn, zi0; + + S_rd = (const xtfloat*)scr; + pY = (const xtfloat*)y; + pX = (const xtfloat*)x; + pZ = (xtfloat*)z; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(z0, S_rd, sz_f32); + XT_LSIP(x0, pX, sz_f32); + XT_LSIP(y0, pY, sz_f32); + + /* Take sign of x and y */ + xi0 = XT_RFR(x0); + yi0 = XT_RFR(y0); + bsx = XT_OLT_S(x0, (xtfloat)0.0f); + bsy = XT_OLT_S(y0, (xtfloat)0.0f); + + xabs = XT_ABS_S(x0); + /* check if y is integer */ + { /* validate if y is integral - all numbers bigger than 2^23 are assumed as integral */ + xtfloat t, c; + t = XT_ABS_S((xtfloat)y0); + c = 8388608.f; + XT_MOVT_S(c, t, XT_ULT_S(t, 8388608.f)); + t = c; + t0 = XT_FLOAT_S(XT_TRUNC_S(t, 0), 0); + b_yint = XT_OEQ_S(XT_FLOAT_S(XT_TRUNC_S(t, 0), 0), t); + } + + /* check if y is odd */ + e0 = XT_TRUNC_S(y0, 0); //temp0 + b_e0 = xtbool2_extract_0(AE_EQ32(e0, MAX_INT32));//~b_tmp0 + b0i = AE_MOVAB(b_e0); + b1i = AE_MOVAB(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA(b0i); + AE_MOVF_32(e0, AE_ZERO32(), b0); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = XT_OEQ_S((xtfloat)0.0f, y0); /* y ==0 */ + b_yinf = XT_OEQ_S(XT_ABS_S(y0), xa_nnlib_pow_plusInff.f); /* |y|==Inf */ + b_xeqz = XT_OEQ_S(x0, (xtfloat)0.0f); /* x ==0 */ + b_xeq1 = XT_OEQ_S(xabs, (xtfloat)1.0f); /* |x|==1 */ + b_xinf = XT_OEQ_S(xabs, xa_nnlib_pow_plusInff.f); /* |x|==INF */ + + yint = AE_MOVAB(b_yint); + yeqz = AE_MOVAB(b_yeqz); + yinf = AE_MOVAB(b_yinf); + xeqz = AE_MOVAB(b_xeqz); + xeq1 = AE_MOVAB(b_xeq1); + xinf = AE_MOVAB(b_xinf); + sx = AE_MOVAB(bsx); + sy = AE_MOVAB(bsy); + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA(NaN1); + b_NaN2 = XT_UN_S(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA(one); + b_Inf = AE_MOVBA(Inf); + b_zero = AE_MOVBA(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloat)xa_nnlib_pow_qNaNf.f; + XT_MOVF_S(spec, half, b_NaN1); + XT_MOVT_S(spec, _0, b_zero); + XT_MOVT_S(spec, xa_nnlib_pow_plusInff.f, b_Inf); + XT_MOVT_S(spec, xa_nnlib_pow_qNaNf.f, b_NaN2); + XT_MOVT_S(spec, _1, b_one); + + b_notspec = XT_OEQ_S(spec, half); + /* Replace result with special numbers if needed */ + XT_MOVF_S(z0, spec, b_notspec); + /* 
Restore sign and store result */ + zi0 = XT_RFR(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_WFR(zi0); + XT_SSIP(z0, pZ, sz_f32); + } + } + } + +} /* vec_powf() */ +#endif diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS index 1b55a7d541..95a7bdc369 100644 --- a/backends/cadence/runtime/TARGETS +++ b/backends/cadence/runtime/TARGETS @@ -7,6 +7,8 @@ python_library( srcs = [ "__init__.py", "executor.py", + "runtime.py", + "utils.py" ] + glob([ "xtsc-cfg/**/*", ]), @@ -16,6 +18,7 @@ python_library( "//executorch/devtools/bundled_program:config", "//executorch/devtools/bundled_program:core", "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/devtools:lib", "//executorch/exir:lib", ], ) diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/runtime/runtime.py index bf2932d9c7..0268931c40 100644 --- a/backends/cadence/runtime/runtime.py +++ b/backends/cadence/runtime/runtime.py @@ -167,9 +167,7 @@ def run( def compare( - # pyre-fixme[2]: Parameter annotation cannot be `Any`. outputs: Any, - # pyre-fixme[2]: Parameter annotation cannot be `Any`. ref_outputs: Any, name: str = "", eps_error: float = 1e-1, @@ -223,7 +221,6 @@ def run_and_compare( compare(outputs, ref_outputs, eps_error=eps_error, eps_warn=eps_warn) -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def to_nd_array(v: Union[bool, numbers.Number, ndarray, torch.Tensor]) -> np.ndarray: if isinstance(v, np.ndarray): return v diff --git a/backends/cadence/runtime/utils.py b/backends/cadence/runtime/utils.py index b3ed622e8b..0a85b6dd61 100644 --- a/backends/cadence/runtime/utils.py +++ b/backends/cadence/runtime/utils.py @@ -13,12 +13,11 @@ import torch -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. -def distance(fn: Callable[[np.ndarray, np.ndarray], float]) -> Callable[ +def distance( + fn: Callable[[np.ndarray, np.ndarray], float], +) -> Callable[ [ - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. typing.Union[np.ndarray, torch._tensor.Tensor], - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. typing.Union[np.ndarray, torch._tensor.Tensor], ], float, @@ -27,9 +26,7 @@ def distance(fn: Callable[[np.ndarray, np.ndarray], float]) -> Callable[ # the distance between two N-D tensors given a function. This can be a RMS # function, maximum abs diff, or any kind of distance function. def wrapper( - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. a: Union[np.ndarray, torch.Tensor], - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. b: Union[np.ndarray, torch.Tensor], ) -> float: # convert a and b to np.ndarray type fp64 @@ -68,24 +65,20 @@ def wrapper( @distance -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def rms(a: np.ndarray, b: np.ndarray) -> float: return ((a - b) ** 2).mean() ** 0.5 @distance -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def max_abs_diff(a: np.ndarray, b: np.ndarray) -> float: return np.abs(a - b).max() @distance -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def max_rel_diff(x: np.ndarray, x_ref: np.ndarray) -> float: return np.abs((x - x_ref) / x_ref).max() -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. 
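
For context on the `distance` decorator whose annotations are cleaned up here: it wraps a plain `np.ndarray -> float` metric so the decorated function also accepts `torch.Tensor` inputs, converting both arguments to fp64 arrays first. A new metric would be registered the same way as `rms` or `max_abs_diff`; a hypothetical example (the import path is assumed from the file location):

```python
import numpy as np
import torch

from executorch.backends.cadence.runtime.utils import distance

@distance
def mean_abs_diff(a: np.ndarray, b: np.ndarray) -> float:
    return float(np.abs(a - b).mean())

# After decoration the metric accepts numpy arrays and torch tensors interchangeably.
err = mean_abs_diff(torch.ones(4), np.zeros(4))  # 1.0
```
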
def to_np_arr_fp64(x: Union[np.ndarray, torch.Tensor]) -> np.ndarray: if isinstance(x, torch.Tensor): x = x.detach().cpu().numpy() @@ -94,11 +87,8 @@ def to_np_arr_fp64(x: Union[np.ndarray, torch.Tensor]) -> np.ndarray: return x -# pyre-fixme[3]: Return type must be annotated. def normalized_rms( - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. predicted: Union[np.ndarray, torch.Tensor], - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. ground_truth: Union[np.ndarray, torch.Tensor], ): num = rms(predicted, ground_truth) diff --git a/backends/qualcomm/runtime/targets.bzl b/backends/qualcomm/runtime/targets.bzl index ac65b442aa..be4c56b587 100644 --- a/backends/qualcomm/runtime/targets.bzl +++ b/backends/qualcomm/runtime/targets.bzl @@ -35,45 +35,49 @@ def define_common_targets(): ], ) - runtime.cxx_library( - name = "runtime", - srcs = glob( - [ - "*.cpp", - "backends/*.cpp", - "backends/htpbackend/*.cpp", - "backends/htpbackend/aarch64/*.cpp", + # "runtime" target is used for offline compile, can be renamed to runtime_aot_build as a BE. + for include_aot_qnn_lib in (True, False): + qnn_build_suffix = ("" if include_aot_qnn_lib else "_android_build") + runtime.cxx_library( + name = "runtime" + qnn_build_suffix, + srcs = glob( + [ + "*.cpp", + "backends/*.cpp", + "backends/htpbackend/*.cpp", + "backends/htpbackend/aarch64/*.cpp", + ], + exclude = ["Logging.cpp"], + ), + exported_headers = glob( + [ + "*.h", + "backends/*.h", + "backends/htpbackend/*.h", + ], + exclude = ["Logging.h"], + ), + define_static_target = True, + link_whole = True, # needed for executorch/examples/models/llama:main to register QnnBackend + platforms = [ANDROID], + visibility = ["@EXECUTORCH_CLIENTS"], + resources = ({ + "qnn_lib": "fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs".format(get_qnn_library_verision()), + } if include_aot_qnn_lib else { + }), + deps = [ + "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), + ":logging", + "//executorch/backends/qualcomm:schema", + "//executorch/backends/qualcomm:qc_binary_info_schema", + "//executorch/backends/qualcomm/aot/ir:qcir_utils", + "//executorch/backends/qualcomm/aot/wrappers:wrappers", + "//executorch/runtime/backend:interface", + "//executorch/runtime/core:core", + "//executorch/extension/tensor:tensor", ], - exclude = ["Logging.cpp"], - ), - exported_headers = glob( - [ - "*.h", - "backends/*.h", - "backends/htpbackend/*.h", + exported_deps = [ + "//executorch/runtime/core/exec_aten/util:scalar_type_util", + "//executorch/runtime/core:event_tracer", ], - exclude = ["Logging.h"], - ), - define_static_target = True, - link_whole = True, # needed for executorch/examples/models/llama:main to register QnnBackend - platforms = [ANDROID], - visibility = ["@EXECUTORCH_CLIENTS"], - resources = { - "qnn_lib": "fbsource//third-party/qualcomm/qnn/qnn-{0}:qnn_offline_compile_libs".format(get_qnn_library_verision()), - }, - deps = [ - "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), - ":logging", - "//executorch/backends/qualcomm:schema", - "//executorch/backends/qualcomm:qc_binary_info_schema", - "//executorch/backends/qualcomm/aot/ir:qcir_utils", - "//executorch/backends/qualcomm/aot/wrappers:wrappers", - "//executorch/runtime/backend:interface", - "//executorch/runtime/core:core", - "//executorch/extension/tensor:tensor", - ], - exported_deps = [ - "//executorch/runtime/core/exec_aten/util:scalar_type_util", - 
"//executorch/runtime/core:event_tracer", - ], - ) + ) diff --git a/backends/qualcomm/targets.bzl b/backends/qualcomm/targets.bzl index 14e02989e5..521152d279 100644 --- a/backends/qualcomm/targets.bzl +++ b/backends/qualcomm/targets.bzl @@ -120,7 +120,7 @@ def define_common_targets(): "fbsource//third-party/qualcomm/qnn/qnn-{0}:api".format(get_qnn_library_verision()), "//executorch/runtime/backend:interface", "//executorch/runtime/core:core", - "//executorch/backends/qualcomm/runtime:runtime", + "//executorch/backends/qualcomm/runtime:runtime_android_build", ], exported_deps = [ ":schema", diff --git a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl index 5c3de75634..73a444cd84 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/embedding.glsl @@ -47,9 +47,9 @@ void main() { const ivec3 in_lpos = ivec3(out_tidx.y, out_tidx.z * 4 + i, out_tidx.w / 4); const int in_texel_elem = load_texel_lpos(t_in, in_lpos, in_axis_map)[out_tidx.w % 4]; - // Read weight tensor for embedding. - const ivec3 weight_lpos = ivec3(out_tidx.x, in_texel_elem, 0); - out_texel[i] = load_texel_lpos(t_weight, weight_lpos, weight_axis_map).x; + // Read weight tensor for embedding, it is height-packed. + const ivec3 weight_lpos = ivec3(out_tidx.x, in_texel_elem / 4, 0); + out_texel[i] = load_texel_lpos(t_weight, weight_lpos, weight_axis_map)[in_texel_elem % 4]; } write_texel_lpos(t_out, out_lpos, out_texel, out_axis_map); diff --git a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl index 8414d811fc..5378099d03 100644 --- a/backends/vulkan/runtime/graph/ops/glsl/permute.glsl +++ b/backends/vulkan/runtime/graph/ops/glsl/permute.glsl @@ -36,8 +36,10 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block { layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; +#extension GL_EXT_shader_explicit_arithmetic_types_int16 : require + void main() { - const ivec3 pos = ivec3(gl_GlobalInvocationID); + const u16vec3 pos = u16vec3(gl_GlobalInvocationID); if (any(greaterThanEqual(pos, out_limits))) { return; @@ -46,28 +48,34 @@ void main() { const int out_channel_4up = int(ch_info.x); const int in_channel_4up = int(ch_info.y); const int out_batch = int(sizes[3]); - const int max_dst_index = out_batch * out_channel_4up; VEC4_T outval = VEC4_T(0.0); + ivec4 v = ivec4(0); // holds b,c,h,w + + v[out_ndims[2]] = pos.y; + v[out_ndims[3]] = pos.x; + + const int dst_index = pos.z << 2; + int dst_out_index = dst_index / out_channel_4up; + int dst_out_lane = dst_index % out_channel_4up; - for (int j = 0; j < 4; ++j) { - int dst_index = pos.z * 4 + j; - if (dst_index >= max_dst_index) { + for (int j = 0; j < 4; ++j, ++dst_out_lane) { + if (dst_out_index >= out_batch) { // out of range break; } - ivec4 v = ivec4(0); // holds b,c,h,w - v[out_ndims[0]] = dst_index / out_channel_4up; - v[out_ndims[1]] = dst_index % out_channel_4up; - v[out_ndims[2]] = pos.y; - v[out_ndims[3]] = pos.x; + if (dst_out_lane == out_channel_4up) { + dst_out_lane = 0; + dst_out_index++; + } + + v[out_ndims[0]] = dst_out_index; + v[out_ndims[1]] = dst_out_lane; int src_index = v[0] * in_channel_4up + v[1]; - int w = v[3]; - int h = v[2]; - VEC4_T inval = VEC4_T(texelFetch(image_in, ivec3(w, h, src_index / 4), 0)); - outval[j] = inval[src_index % 4]; + VEC4_T inval = VEC4_T(texelFetch(image_in, u16vec3(v[3], v[2], src_index >> 2), 0)); + outval[j] = inval[src_index & 
0x3]; } imageStore(image_out, pos, outval); diff --git a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp index 05ebd3d1a6..8160908cc5 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Embedding.cpp @@ -15,13 +15,21 @@ #include +#include + namespace vkcompute { +using utils::GPUMemoryLayout; +using utils::StorageType; + void check_embedding_args( const api::vTensor& weight, const api::vTensor& in, const api::vTensor& out) { - VK_CHECK_COND(check_packed_dim_is(weight, WHCN::kChannelsDim)); + // The packing logic may not be trivial here. Input and output are Channel + // Packed, which is default for the Vulkan backend. However, weight vector is + // height-packed instead of channel-packed for space reason. + VK_CHECK_COND(check_packed_dim_is(weight, WHCN::kHeightDim)); VK_CHECK_COND(check_packed_dim_is(in, WHCN::kChannelsDim)); VK_CHECK_COND(check_packed_dim_is(out, WHCN::kChannelsDim)); } @@ -58,7 +66,12 @@ void add_embedding_node( void embedding(ComputeGraph& graph, const std::vector& args) { ValueRef in = args[1]; ValueRef out = args[5]; - ValueRef weight = prepack_standard_like(graph, args[0], out); + + ValueRef weight = prepack_standard( + graph, + args[0], + StorageType::TEXTURE_2D, + GPUMemoryLayout::TENSOR_HEIGHT_PACKED); add_embedding_node(graph, weight, in, out); } diff --git a/backends/vulkan/serialization/vulkan_graph_serialize.py b/backends/vulkan/serialization/vulkan_graph_serialize.py index 37785f4752..c97ea69a43 100644 --- a/backends/vulkan/serialization/vulkan_graph_serialize.py +++ b/backends/vulkan/serialization/vulkan_graph_serialize.py @@ -1,6 +1,8 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # +# pyre-strict +# # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
@@ -19,9 +21,9 @@ VkBytes, VkGraph, ) -from executorch.exir._serialize._dataclass import _DataclassEncoder +from executorch.exir._serialize._dataclass import _DataclassEncoder, _json_to_dataclass -from executorch.exir._serialize._flatbuffer import _flatc_compile +from executorch.exir._serialize._flatbuffer import _flatc_compile, _flatc_decompile def convert_to_flatbuffer(vk_graph: VkGraph) -> bytes: @@ -40,6 +42,25 @@ def convert_to_flatbuffer(vk_graph: VkGraph) -> bytes: return output_file.read() +def flatbuffer_to_vk_graph(flatbuffers: bytes) -> VkGraph: + # Following similar (de)serialization logic on other backends: + # https://github.com/pytorch/executorch/blob/main/backends/qualcomm/serialization/qc_schema_serialize.py#L33 + with tempfile.TemporaryDirectory() as d: + schema_path = os.path.join(d, "schema.fbs") + with open(schema_path, "wb") as schema_file: + schema_file.write(pkg_resources.resource_string(__name__, "schema.fbs")) + + bin_path = os.path.join(d, "schema.bin") + with open(bin_path, "wb") as bin_file: + bin_file.write(flatbuffers) + + _flatc_decompile(d, schema_path, bin_path, ["--raw-binary"]) + + json_path = os.path.join(d, "schema.json") + with open(json_path, "rb") as output_file: + return _json_to_dataclass(json.load(output_file), VkGraph) + + @dataclass class VulkanDelegateHeader: # Defines the byte region that each component of the header corresponds to diff --git a/backends/vulkan/test/test_serialization.py b/backends/vulkan/test/test_serialization.py index eb112d7b12..c373f5216d 100644 --- a/backends/vulkan/test/test_serialization.py +++ b/backends/vulkan/test/test_serialization.py @@ -1,6 +1,8 @@ # Copyright (c) Meta Platforms, Inc. and affiliates. # All rights reserved. # +# pyre-strict +# # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
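
The `flatbuffer_to_vk_graph` helper added above inverts `convert_to_flatbuffer`: it dumps the bundled `schema.fbs`, decompiles the binary with `_flatc_decompile`, and rebuilds the `VkGraph` dataclass from the resulting JSON. A minimal round-trip sketch, mirroring the new unit test below:

```python
from executorch.backends.vulkan.serialization.vulkan_graph_schema import (
    OperatorCall,
    String,
    VkGraph,
    VkValue,
)
from executorch.backends.vulkan.serialization.vulkan_graph_serialize import (
    convert_to_flatbuffer,
    flatbuffer_to_vk_graph,
)

graph = VkGraph(
    version="0",
    chain=[OperatorCall(node_id=0, name="noop", args=[])],
    values=[VkValue(value=String(string_val="hello"))],
    input_ids=[],
    output_ids=[],
    constants=[],
    shaders=[],
)
roundtrip = flatbuffer_to_vk_graph(convert_to_flatbuffer(graph))
assert roundtrip == graph  # lossless round trip, as the test below also checks
```
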
@@ -11,9 +13,17 @@ import torch -from executorch.backends.vulkan.serialization.vulkan_graph_schema import VkGraph +from executorch.backends.vulkan.serialization.vulkan_graph_schema import ( + IntList, + OperatorCall, + String, + VkGraph, + VkValue, +) from executorch.backends.vulkan.serialization.vulkan_graph_serialize import ( + convert_to_flatbuffer, + flatbuffer_to_vk_graph, serialize_vulkan_graph, VulkanDelegateHeader, ) @@ -36,7 +46,7 @@ def _generate_random_const_tensors(self, num_tensors: int) -> List[torch.Tensor] return tensors - def test_serialize_vulkan_binary(self): + def test_serialize_vulkan_binary(self) -> None: vk_graph = VkGraph( version="0", chain=[], @@ -93,3 +103,33 @@ def test_serialize_vulkan_binary(self): tensor_bytes = bytes(array) self.assertEqual(constant_data_bytes, tensor_bytes) + + def test_serialize_deserialize_vkgraph(self) -> None: + in_vk_graph = VkGraph( + version="1", + chain=[ + OperatorCall(node_id=1, name="foo", args=[1, 2, 3]), + OperatorCall(node_id=2, name="bar", args=[]), + ], + values=[ + VkValue( + value=String( + string_val="abc", + ), + ), + VkValue( + value=IntList( + items=[-1, -4, 2], + ), + ), + ], + input_ids=[], + output_ids=[], + constants=[], + shaders=[], + ) + + bs = convert_to_flatbuffer(in_vk_graph) + out_vk_graph = flatbuffer_to_vk_graph(bs) + + self.assertEqual(in_vk_graph, out_vk_graph) diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index d5d572e46e..4ea82e595b 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit d5d572e46ed3929fa3e67f6174192893943cf724 +Subproject commit 4ea82e595b36106653175dcb04b2aa532660d0d8 diff --git a/backends/xnnpack/third-party/xnnpack.buck.bzl b/backends/xnnpack/third-party/xnnpack.buck.bzl index d2068661fe..6ce0316010 100644 --- a/backends/xnnpack/third-party/xnnpack.buck.bzl +++ b/backends/xnnpack/third-party/xnnpack.buck.bzl @@ -42,7 +42,7 @@ def define_xnnpack(): "XNNPACK/src/mutex.c", "XNNPACK/src/normalization.c", "XNNPACK/src/operator-utils.c", - "XNNPACK/src/packing.cc", + "XNNPACK/src/reference/packing.cc", ], headers = get_xnnpack_headers(), header_namespace = "", @@ -67,7 +67,7 @@ def define_xnnpack(): # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. native.cxx_library( name = "subgraph", - srcs = SUBGRAPH_SRCS, + srcs = SUBGRAPH_SRCS + ["XNNPACK/src/datatype.c"], compiler_flags = [ "-Wno-error=missing-braces", # required since the SGX toolchain does not have this by default ], @@ -1076,6 +1076,8 @@ def define_xnnpack(): "XNNPACK/src/configs/hardware-config.c", "XNNPACK/src/microparams-init.c", "XNNPACK/src/microkernel-utils.c", + "XNNPACK/src/reference/binary-elementwise.cc", + "XNNPACK/src/reference/unary-elementwise.cc", ], headers = get_xnnpack_headers(), exported_headers = { diff --git a/backends/xnnpack/third-party/xnnpack_src_defs.bzl b/backends/xnnpack/third-party/xnnpack_src_defs.bzl index 038b90acab..8cb9affede 100644 --- a/backends/xnnpack/third-party/xnnpack_src_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_src_defs.bzl @@ -17,24 +17,14 @@ def prod_srcs_for_arch_wrapper(arch): return define_xnnpack_build_src(prod_srcs) def get_xnnpack_headers(): - # XNNPACK Headers in the path containing xnnpack/ or configs/ - # do not contain the src/ path. However headers not in xnnpack/ or - # configs/ are prepend with the src/ path. 
This function helps us - # to correctly parse all the header files to the correct name src_headers = subdir_glob([ ("XNNPACK/src", "**/*.h"), ]) - fixed_headers = {} - for k, v in src_headers.items(): - new_key = k - if not k.startswith("xnnpack") and not k.startswith("configs"): - new_key = "src/{}".format(k) - fixed_headers[new_key] = v include_headers = subdir_glob([ ("XNNPACK/include", "*.h"), ]) - return fixed_headers | include_headers + return src_headers | include_headers OPERATOR_SRCS = define_xnnpack_build_src(_OPERATOR_SRCS) SUBGRAPH_SRCS = define_xnnpack_build_src(_SUBGRAPH_SRCS) diff --git a/docs/source/executorch-arm-delegate-tutorial.md b/docs/source/executorch-arm-delegate-tutorial.md index 25b5551b5e..855a828c23 100644 --- a/docs/source/executorch-arm-delegate-tutorial.md +++ b/docs/source/executorch-arm-delegate-tutorial.md @@ -322,7 +322,6 @@ ethos_u_build_dir=examples/arm/executor_runner/ elf=$(find ${ethos_u_build_dir} -name "arm_executor_runner") FVP_Corstone_SSE-320_Ethos-U85 \ - -C mps4_board.subsystem.cpu0.CFGITCMSZ=11 \ -C mps4_board.subsystem.ethosu.num_macs=${num_macs} \ -C mps4_board.visualisation.disable-visualisation=1 \ -C vis_hdlcd.disable_visualisation=1 \ diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0001-Add-.data-fixup-from-Corestone-300.patch b/examples/arm/ethos-u-setup/core_platform/patches/0001-Add-.data-fixup-from-Corestone-300.patch deleted file mode 100644 index f2df3350d0..0000000000 --- a/examples/arm/ethos-u-setup/core_platform/patches/0001-Add-.data-fixup-from-Corestone-300.patch +++ /dev/null @@ -1,24 +0,0 @@ -From 162ea6b51bd94fabf623cc6b63cf271497eaff8d Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Per=20=C3=85strand?= -Date: Fri, 13 Sep 2024 11:47:03 +0200 -Subject: [PATCH] Add .data fixup from Corestone-300 - ---- - targets/corstone-320/platform.ld | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/targets/corstone-320/platform.ld b/targets/corstone-320/platform.ld -index 2010d14..fb4e7b7 100644 ---- a/targets/corstone-320/platform.ld -+++ b/targets/corstone-320/platform.ld -@@ -77,6 +77,7 @@ PHDRS - rom_boot PT_LOAD; - rom_exec PT_LOAD; - rom_dram PT_LOAD; -+ data PT_LOAD; /* HACK: New prog header for .data (and friends) going in DTCM */ - null PT_NULL; - } - --- -2.39.3 (Apple Git-146) - diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch b/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch new file mode 100644 index 0000000000..4467185ae7 --- /dev/null +++ b/examples/arm/ethos-u-setup/core_platform/patches/0001-Move-rodata-to-the-DDR.patch @@ -0,0 +1,34 @@ +From 0fb46c2fe4a072546f87c6cb9202d5001f1eb9c5 Mon Sep 17 00:00:00 2001 +From: George Gekov +Date: Mon, 18 Nov 2024 11:24:11 +0000 +Subject: [PATCH] Move rodata to the DDR + +--- + targets/corstone-300/platform.ld | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/targets/corstone-300/platform.ld b/targets/corstone-300/platform.ld +index b458fc6..8d4bc73 100644 +--- a/targets/corstone-300/platform.ld ++++ b/targets/corstone-300/platform.ld +@@ -154,7 +154,7 @@ SECTIONS + *(SORT(.dtors.*)) + *(.dtors) + +- *(.rodata*) ++ + + KEEP(*(.eh_frame*)) + } > ITCM :rom_exec +@@ -280,7 +280,7 @@ SECTIONS + #endif + * (expected_output_data_sec) + * (sec_command_stream, sec_weight_data, sec_input_data) +- ++ *(.rodata*) + * (ethosu_core_in_queue) + * (ethosu_core_out_queue) + . 
= ALIGN(4); +-- +2.25.1 + diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0001-New-phdr-for-.data-section.patch b/examples/arm/ethos-u-setup/core_platform/patches/0001-New-phdr-for-.data-section.patch deleted file mode 100644 index d3ece70d6c..0000000000 --- a/examples/arm/ethos-u-setup/core_platform/patches/0001-New-phdr-for-.data-section.patch +++ /dev/null @@ -1,33 +0,0 @@ -From fc2ff3e005999ec185a1ae20c78c06a45651f5bc Mon Sep 17 00:00:00 2001 -From: Digant Desai -Date: Mon, 2 Oct 2023 20:39:39 -0700 -Subject: [PATCH 1/2] New phdr for .data section - ---- - targets/corstone-300/platform.ld | 3 ++- - 1 file changed, 2 insertions(+), 1 deletion(-) - -diff --git a/targets/corstone-300/platform.ld b/targets/corstone-300/platform.ld -index 8d77329..8de77c4 100644 ---- a/targets/corstone-300/platform.ld -+++ b/targets/corstone-300/platform.ld -@@ -94,6 +94,7 @@ PHDRS - { - rom_exec PT_LOAD; - rom_dram PT_LOAD; -+ data PT_LOAD; /* HACK: New prog header for .data (and friends) going in DTCM */ - null PT_NULL; - } - -@@ -247,7 +248,7 @@ SECTIONS - /* All data end */ - __data_end__ = .; - -- } > DTCM :rom_exec -+ } > DTCM :data - - .sram.bss : - { --- -2.34.1 - diff --git a/examples/arm/ethos-u-setup/core_platform/patches/0003-Make-ITCM-1MB.patch b/examples/arm/ethos-u-setup/core_platform/patches/0003-Make-ITCM-1MB.patch deleted file mode 100644 index 54ca9f4c93..0000000000 --- a/examples/arm/ethos-u-setup/core_platform/patches/0003-Make-ITCM-1MB.patch +++ /dev/null @@ -1,37 +0,0 @@ -From aa65a514e5860267a6d9d52e80b1f8e03c720c6c Mon Sep 17 00:00:00 2001 -From: Zingo Andersen -Date: Tue, 4 Jun 2024 06:20:14 +0200 -Subject: [PATCH 3/3] Make ITCM 1MB - -Signed-off-by: Zingo Andersen ---- - targets/corstone-300/platform.ld | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/targets/corstone-300/platform.ld b/targets/corstone-300/platform.ld -index 476a2f8..080cc5e 100644 ---- a/targets/corstone-300/platform.ld -+++ b/targets/corstone-300/platform.ld -@@ -46,8 +46,8 @@ - * +-----------------------+-------------+-------------+----+--------------------------------------+ - * | Memory region name | Base addr | Size |IDAU| MCC load address + remarks | - * +-----------------------+-------------+-------------+----+--------------------------------------+ -- * | ITCM | 0x0000_0000 | 0x0008_0000 | NS | 0x0000_0000; 512 kiB | -- * | ITCM | 0x1000_0000 | 0x0008_0000 | S | Secure alias for NS ITCM | -+ * | ITCM | 0x0000_0000 | 0x0010_0000 | NS | 0x0000_0000; 1 MiB | -+ * | ITCM | 0x1000_0000 | 0x0010_0000 | S | Secure alias for NS ITCM | - * | FPGA Data SRAM; BRAM | 0x0100_0000 | 0x0010_0000 | NS | 0x0100_0000; 1 MiB | - * | FPGA data SRAM; BRAM | 0x1100_0000 | 0x0010_0000 | S | Secure alias for NS BRAM | - * | DTCM | 0x2000_0000 | 0x0008_0000 | NS | 512 kiB; 4 banks of 128k each | -@@ -82,7 +82,7 @@ __HEAP_SIZE = 0x00008000; - - MEMORY - { -- ITCM (rx) : ORIGIN = 0x10000000, LENGTH = 0x00080000 -+ ITCM (rx) : ORIGIN = 0x10000000, LENGTH = 0x00100000 - BRAM (rw) : ORIGIN = 0x11000000, LENGTH = 0x00100000 - DTCM (rw) : ORIGIN = 0x30000000, LENGTH = 0x00080000 - SRAM (rw) : ORIGIN = 0x31000000, LENGTH = 0x00200000 --- -2.25.1 - diff --git a/examples/arm/executor_runner/CMakeLists.txt b/examples/arm/executor_runner/CMakeLists.txt index 064023a70d..7da3462924 100644 --- a/examples/arm/executor_runner/CMakeLists.txt +++ b/examples/arm/executor_runner/CMakeLists.txt @@ -234,6 +234,7 @@ target_link_libraries( quantized_kernels portable_kernels "-Wl,--no-whole-archive" + -Xlinker 
-Map=arm_executor_runner.map ) # ET headers and generated headers includes diff --git a/examples/arm/run.sh b/examples/arm/run.sh index 9dc95600d5..0e5fa9db34 100755 --- a/examples/arm/run.sh +++ b/examples/arm/run.sh @@ -229,7 +229,6 @@ function run_fvp() { if [[ ${target} == *"ethos-u55"* ]]; then echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}" ${fvp_model} \ - -C cpu0.CFGITCMSZ=11 \ -C ethosu.num_macs=${num_macs} \ -C mps3_board.visualisation.disable-visualisation=1 \ -C mps3_board.telnetterminal0.start_telnet=0 \ @@ -241,7 +240,6 @@ function run_fvp() { elif [[ ${target} == *"ethos-u85"* ]]; then echo "Running ${elf} for ${target} run with FVP:${fvp_model} num_macs:${num_macs}" ${fvp_model} \ - -C mps4_board.subsystem.cpu0.CFGITCMSZ=11 \ -C mps4_board.subsystem.ethosu.num_macs=${num_macs} \ -C mps4_board.visualisation.disable-visualisation=1 \ -C vis_hdlcd.disable_visualisation=1 \ diff --git a/examples/cadence/operators/TARGETS b/examples/cadence/operators/TARGETS new file mode 100644 index 0000000000..732f1ced09 --- /dev/null +++ b/examples/cadence/operators/TARGETS @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("odai_jarvis") + + +python_unittest( + name = "test_add_op", + srcs = [ + "test_add_op.py", + ], + typing = True, + supports_static_listing = False, + deps = [ + "fbsource//third-party/pypi/parameterized:parameterized", + "//caffe2:torch", + "//executorch/backends/cadence/aot:ops_registrations", + "//executorch/backends/cadence/aot:export_example", + "//executorch/backends/cadence/aot:compiler", + ], +) diff --git a/examples/cadence/operators/test_add_op.py b/examples/cadence/operators/test_add_op.py new file mode 100644 index 0000000000..7799fe624b --- /dev/null +++ b/examples/cadence/operators/test_add_op.py @@ -0,0 +1,117 @@ +# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +import unittest +from typing import Tuple + +from parameterized import parameterized + +from executorch.backends.cadence.aot.ops_registrations import * # noqa + +import torch +import torch.nn as nn +from executorch.backends.cadence.aot.export_example import export_model + + +class ATenOpTestCases(unittest.TestCase): + # pyre-fixme[16]: Module `parameterized.parameterized` has no attribute `expand`. 
+ @parameterized.expand( + [ + [(7, 5, 6), (7, 5, 6)], + [(7, 5, 6), (1)], + [(1), (7, 5, 6)], + [(1), (7, 5, 6), 2.23], + [(1), (7, 5, 6), -1.0], + [(1), (7, 5, 6), -2.23], + [(7, 5, 6), (7, 5, 6), 1.23], + [(6, 7), (6, 7)], + [(6, 7), (6, 7), 2], + # Broadcast tests (should be optimized on G3) + [(1, 32, 64), (1, 1, 64)], + [(1, 32, 64), (64)], + [(1, 1, 32), (32)], + [(16, 1, 16), (1, 1, 16)], + [(16, 1, 16), (16)], + [(1, 4, 8, 8), (1, 1, 8, 8)], + [(1, 4, 8, 8), (8, 8)], + # Broadcast tests (should go to portable ops) + [(1, 10, 1, 8), (4, 1, 4, 1)], + [(1, 1, 16), (1, 8, 1), 2.5], + # # aten.upsample_nearest2d tests + [(5, 6, 6, 8), (5, 6, 6, 8)], + [(1, 1, 12, 16), (1, 1, 12, 16)], + ] + ) + def test_aten_add_out( + self, Xshape: Tuple[int], Yshape: Tuple[int], alpha: float = 1 + ) -> None: + class AddTensor(nn.Module): + def __init__(self, alpha: float): + super().__init__() + self.alpha = alpha + + def forward(self, x: torch.Tensor, y: torch.Tensor): + return torch.add(x, y, alpha=self.alpha) + + model = AddTensor(alpha) + + X = torch.randn(Xshape) + Y = torch.randn(Yshape) + + model.eval() + export_model( + model, (X, Y), file_name=self._testMethodName, run_and_compare=False + ) + + # pyre-fixme[16]: Module `parameterized.parameterized` has no attribute `expand`. + @parameterized.expand( + [ + [(7, 5, 6), (7, 5, 6)], + [(7, 5, 6), (1)], + [(1), (7, 5, 6)], + [(1), (7, 5, 6), 2.23], + [(1), (7, 5, 6), -1.0], + [(1), (7, 5, 6), -2.23], + [(7, 5, 6), (7, 5, 6), 1.23], + [(6, 7), (6, 7)], + [(6, 7), (6, 7), 2], + # Broadcast tests (should be optimized on G3) + [(1, 32, 64), (1, 1, 64)], + [(1, 32, 64), (64)], + [(1, 1, 32), (32)], + [(16, 1, 16), (1, 1, 16)], + [(16, 1, 16), (16)], + [(1, 4, 8, 8), (1, 1, 8, 8)], + [(1, 4, 8, 8), (8, 8)], + # Broadcast tests (should go to portable ops) + [(1, 10, 1, 8), (4, 1, 4, 1)], + [(1, 1, 16), (1, 8, 1), 2.5], + # # aten.upsample_nearest2d tests + [(5, 6, 6, 8), (5, 6, 6, 8)], + [(1, 1, 12, 16), (1, 1, 12, 16)], + ] + ) + def test_aten_add_scalar_out( + self, Xshape: Tuple[int], Yshape: Tuple[int], alpha: float = 1 + ) -> None: + # Tensor-Scalar addition + class AddScalar(nn.Module): + def __init__(self, alpha: float): + super().__init__() + self.alpha = alpha + + def forward(self, x: torch.Tensor, y: float): + return torch.add(x, y, alpha=self.alpha) + + model = AddScalar(alpha) + + X = torch.randn(Xshape) + Y = 2.34 + + model.eval() + export_model( + model, (X, Y), file_name=self._testMethodName, run_and_compare=False + ) + + +if __name__ == "__main__": + unittest.main() diff --git a/examples/models/__init__.py b/examples/models/__init__.py index 706b0105af..c78106668e 100644 --- a/examples/models/__init__.py +++ b/examples/models/__init__.py @@ -19,6 +19,7 @@ "llama2": ("llama", "Llama2Model"), "llama": ("llama", "Llama2Model"), "llama3_2_vision_encoder": ("llama3_2_vision", "FlamingoVisionEncoderModel"), + # TODO: This take too long to export on both Linux and MacOS (> 6 hours) # "llama3_2_text_decoder": ("llama3_2_vision", "Llama3_2Decoder"), "lstm": ("lstm", "LSTMModel"), "mobilebert": ("mobilebert", "MobileBertModelExample"), diff --git a/examples/models/llama/TARGETS b/examples/models/llama/TARGETS index cf387bfab2..284520d4d5 100644 --- a/examples/models/llama/TARGETS +++ b/examples/models/llama/TARGETS @@ -93,6 +93,7 @@ runtime.python_library( "source_transformation/sdpa.py", "source_transformation/spin_quant.py", "source_transformation/vulkan_rope.py", + "source_transformation/attention_sink.py", ], _is_external_target = True, 
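# A minimal, standalone sketch (toy shapes, not taken from this patch) of the
# torch.add semantics the new Cadence add tests above exercise: elementwise
# addition with broadcasting and an `alpha` multiplier applied to the second
# operand. This is only meant to illustrate what the parameterized cases cover.
import torch

x = torch.randn(1, 32, 64)
y = torch.randn(1, 1, 64)            # broadcasts across the middle dimension
out = torch.add(x, y, alpha=2.0)     # computes x + 2.0 * y
assert out.shape == (1, 32, 64)
assert torch.allclose(out, x + 2.0 * y)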
base_module = "executorch.examples.models.llama", @@ -213,3 +214,16 @@ runtime.python_test( "//executorch/examples/models/llama:llama_transformer", ], ) + +runtime.python_test( + name = "attention_sink_test", + srcs = [ + "source_transformation/test_attention_sink.py", + ], + supports_static_listing = False, + deps = [ + "fbsource//third-party/pypi/parameterized:parameterized", + "//caffe2:torch", + ":export_library", + ], +) diff --git a/examples/models/llama/llama_transformer.py b/examples/models/llama/llama_transformer.py index 3f8b8dd654..10d660d37a 100644 --- a/examples/models/llama/llama_transformer.py +++ b/examples/models/llama/llama_transformer.py @@ -147,6 +147,81 @@ def __post_init__(self): self.head_dim = self.dim // self.n_heads +class Rope(torch.nn.Module): + def __init__(self, params: ModelArgs): + super().__init__() + self.params = params + if self.params.use_hf_rope: + self.precompute_freqs_cis = hf_precompute_freqs_cis + else: + self.precompute_freqs_cis = partial( + precompute_freqs_cis, use_scaled=self.params.use_scaled_rope + ) + freqs_cos, freqs_sin = self.precompute_freqs_cis( + self.params.head_dim, + ( + self.params.max_seq_len # Normal llama2. + if self.params.ffn_dim_multiplier is None + else self.params.max_seq_len * 2 # Sharded checkpoint. + ), + self.params.rope_freq_base, + ) + self.register_buffer("freqs_cos", freqs_cos, persistent=False) + self.register_buffer("freqs_sin", freqs_sin, persistent=False) + if self.params.use_hf_rope: + self.apply_rotary_emb = hf_apply_rotary_emb + else: + self.apply_rotary_emb = RotaryEmbedding() + + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + freqs_cos: torch.Tensor, + freqs_sin: torch.Tensor, + ): + return self.apply_rotary_emb(q, k, freqs_cos, freqs_sin) + + def get_freqs(self, input_pos: Optional[torch.Tensor], seq_len: int): + """ + Get the precomputed frequencies for the given input position and sequence length. + + Args: + input_pos (torch.Tensor): The input position tensor. + seq_len (int): The sequence length. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The precomputed frequencies for the given input position and sequence length. + """ + if self.params.use_kv_cache: + assert ( + input_pos is not None + ), "input_pos must be provided when use_kv_cache is True" + + if self.params.enable_dynamic_shape: + # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos. + input_pos_item = input_pos[-1].item() + torch._check_is_size(input_pos_item) + torch._check(input_pos_item < self.params.max_seq_len) + # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor + freqs_cos = self.freqs_cos.narrow(0, input_pos_item, seq_len) + # pyre-ignore: Incompatible parameter type [6] + freqs_sin = self.freqs_sin.narrow(0, input_pos_item, seq_len) + else: + # When not using dynamic shape, use of the .item results in + # symints, due to querying the data from tensor. + # this path avoids that for mps backend, although probably mps backend + # can support dynamic shape? 
+ freqs_cos = self.freqs_cos[input_pos] + freqs_sin = self.freqs_sin[input_pos] + + else: + assert input_pos is None, "input_pos is unused when use_kv_cache is False" + freqs_cos = self.freqs_cos[:seq_len] + freqs_sin = self.freqs_sin[:seq_len] + return freqs_cos, freqs_sin + + class KVCache(nn.Module): def __init__( self, @@ -266,7 +341,7 @@ def forward( class Attention(nn.Module): - def __init__(self, args: ModelArgs, layer_id: int): + def __init__(self, args: ModelArgs, layer_id: int, rope: Rope): super().__init__() self.use_kv_cache = args.use_kv_cache self.n_heads = args.n_heads @@ -287,6 +362,8 @@ def __init__(self, args: ModelArgs, layer_id: int): self.layer_id = layer_id + self.rope = rope + causal_mask = torch.tril( torch.ones( self.max_seq_len, @@ -303,7 +380,7 @@ def __init__(self, args: ModelArgs, layer_id: int): args.max_seq_len, self.n_kv_heads, self.head_dim, - not args.use_sdpa_with_kv_cache_op, # if we are using the custom op dont transpose the cache. Expect untransposed q k v + not args.use_sdpa_with_kv_cache_op, # if we are using the custom op don't transpose the cache. Expect untransposed q k v args.enable_dynamic_shape, ) self.SDPA = SDPA( @@ -314,10 +391,6 @@ def __init__(self, args: ModelArgs, layer_id: int): max_seq_len=self.max_seq_len, enable_dynamic_shape=args.enable_dynamic_shape, ) - if args.use_hf_rope: - self.apply_rotary_emb = hf_apply_rotary_emb - else: - self.apply_rotary_emb = RotaryEmbedding() def forward( self, @@ -336,7 +409,7 @@ def forward( v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) # RoPE relative positional embeddings - q, k = self.apply_rotary_emb(q, k, freqs_cos, freqs_sin) + q, k = self.rope.forward(q, k, freqs_cos, freqs_sin) if self.use_kv_cache: assert input_pos is not None @@ -424,13 +497,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class TransformerBlock(nn.Module): - def __init__(self, layer_id: int, args: ModelArgs): + def __init__(self, layer_id: int, args: ModelArgs, rope: Rope): super().__init__() self.use_kv_cache = args.use_kv_cache self.n_heads = args.n_heads self.dim = args.dim self.head_dim = args.head_dim - self.attention = Attention(args, layer_id) + self.attention = Attention(args, layer_id, rope) if args.moe: self.block_sparse_moe = MOEFeedForward(args) else: @@ -459,9 +532,10 @@ def __init__(self, params: ModelArgs): self.n_layers = params.n_layers self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) + self.rope = Rope(params) self.layers = torch.nn.ModuleList() for layer_id in range(params.n_layers): - self.layers.append(TransformerBlock(layer_id, params)) + self.layers.append(TransformerBlock(layer_id, params, self.rope)) self.norm = RMSNorm(params.dim, eps=params.norm_eps) self.output = nn.Linear(params.dim, params.vocab_size, bias=False) self.use_kv_cache = params.use_kv_cache @@ -469,23 +543,6 @@ def __init__(self, params: ModelArgs): self.max_seq_len = params.max_seq_len self.input_prune_map = params.input_prune_map self.output_prune_map = params.output_prune_map - if params.use_hf_rope: - self.precompute_freqs_cis = hf_precompute_freqs_cis - else: - self.precompute_freqs_cis = partial( - precompute_freqs_cis, use_scaled=params.use_scaled_rope - ) - freqs_cos, freqs_sin = self.precompute_freqs_cis( - params.head_dim, - ( - params.max_seq_len # Normal llama2. - if params.ffn_dim_multiplier is None - else params.max_seq_len * 2 # Sharded checkpoint. 
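# A self-contained sketch (toy sizes, not from this patch) of what the new Rope
# module's get_freqs boils down to: cos/sin tables are precomputed once per
# position, and a decode step simply narrows them at input_pos. The table recipe
# below follows the usual RoPE construction and is intended purely as an
# illustration of the slicing pattern, not as the module's exact implementation.
import torch

head_dim, max_seq_len, base = 8, 16, 10000.0
inv_freq = 1.0 / (base ** (torch.arange(0, head_dim, 2).float() / head_dim))
angles = torch.outer(torch.arange(max_seq_len).float(), inv_freq)  # (max_seq_len, head_dim // 2)
freqs_cos, freqs_sin = angles.cos(), angles.sin()

# Decode step at position 5 with seq_len 1, using the export-friendly narrow()
# pattern from get_freqs (torch._check_is_size keeps the .item() value usable
# as a size under export).
input_pos = torch.tensor([5])
pos = input_pos[-1].item()
torch._check_is_size(pos)
step_cos = freqs_cos.narrow(0, pos, 1)
step_sin = freqs_sin.narrow(0, pos, 1)
assert step_cos.shape == (1, head_dim // 2)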
- ), - params.rope_freq_base, - ) - self.register_buffer("freqs_cos", freqs_cos, persistent=False) - self.register_buffer("freqs_sin", freqs_sin, persistent=False) def forward( self, @@ -502,33 +559,7 @@ def forward( if tokens is not None and h is None: h = self.tok_embeddings(tokens) seqlen = h.shape[1] - - if self.use_kv_cache: - assert ( - input_pos is not None - ), "input_pos must be provided when use_kv_cache is True" - - if self.params.enable_dynamic_shape: - # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos. - input_pos_item = input_pos[-1].item() - torch._check_is_size(input_pos_item) - torch._check(input_pos_item < self.params.max_seq_len) - # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor - freqs_cos = self.freqs_cos.narrow(0, input_pos_item, seqlen) - # pyre-ignore: Incompatible parameter type [6] - freqs_sin = self.freqs_sin.narrow(0, input_pos_item, seqlen) - else: - # When not using dynamic shape, use of the .item results in - # symints, due to querying the data from tensor. - # this path avoids that for mps backend, although probably mps backend - # can support dynamic shape? - freqs_cos = self.freqs_cos[input_pos] - freqs_sin = self.freqs_sin[input_pos] - - else: - assert input_pos is None, "input_pos is unused when use_kv_cache is False" - freqs_cos = self.freqs_cos[:seqlen] - freqs_sin = self.freqs_sin[:seqlen] + freqs_cos, freqs_sin = self.rope.get_freqs(input_pos, seqlen) for layer in self.layers: h = layer( diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py index 0383c79898..1445787f5e 100644 --- a/examples/models/llama/rope.py +++ b/examples/models/llama/rope.py @@ -92,6 +92,22 @@ def apply_rotary_emb( return xq_out.type_as(xq), xk_out.type_as(xk) +def apply_rotary_emb_to_k( + xk: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor +) -> torch.Tensor: + xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1) + + freqs_cos = reshape_for_broadcast(freqs_cos, xk_r) + freqs_sin = reshape_for_broadcast(freqs_sin, xk_r) + + xk_out_r = xk_r * freqs_cos - xk_i * freqs_sin + xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos + + xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3) + + return xk_out.type_as(xk) + + class RotaryEmbedding(torch.nn.Module): def __init__(self): super().__init__() @@ -160,3 +176,28 @@ def hf_apply_rotary_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed + + +def hf_apply_rotary_emb_to_k(k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the key tensors. + + Args: + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of k. 
Similarly, if k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `torch.Tensor` the key tensor rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + k_embed = (k * cos) + (rotate_half(k) * sin) + return k_embed diff --git a/examples/models/llama/source_transformation/attention_sink.py b/examples/models/llama/source_transformation/attention_sink.py new file mode 100644 index 0000000000..8f4fd1ebd2 --- /dev/null +++ b/examples/models/llama/source_transformation/attention_sink.py @@ -0,0 +1,89 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Components for supporting Attention Sink. See +# https://arxiv.org/abs/2309.17453 for more details about Attention Sink. + +from typing import Optional + +import torch + +from executorch.examples.models.llama.llama_transformer import ModelArgs, Rope +from executorch.examples.models.llama.rope import ( + apply_rotary_emb_to_k, + hf_apply_rotary_emb_to_k, +) + + +class RopeWithAttentionSink(Rope): + """ + Rope that helps adjust position encoding when tokens are shifted in KVCache. + For AttentionSink, when tokens are shifted in KVCache, we need to use positions + in KVCache instead of positions in the actual text. + """ + + def __init__( + self, + params: ModelArgs, + window_size: int, + sink_size: int, + eviction_batch_size: int, + ): + super().__init__(params) + if self.params.use_hf_rope: + self.apply_rotary_emb_to_k = hf_apply_rotary_emb_to_k + else: + self.apply_rotary_emb_to_k = apply_rotary_emb_to_k + self.max_seq_length = window_size + sink_size + assert self.max_seq_length == self.params.max_seq_len + self.eviction_batch_size = eviction_batch_size + self.position_shift = 0 + + def get_freqs(self, input_pos: Optional[torch.Tensor], seq_len: int): + assert input_pos is not None + + input_pos_item = input_pos.item() + torch._check_is_size(input_pos_item) + if input_pos_item + self.position_shift + seq_len > self.max_seq_length: + # There are not enough spaces in the cache to store the new tokens. + # We need to evict some old tokens and shift some recent tokens. + num_to_evict = max( + input_pos_item + self.position_shift - self.max_seq_length + seq_len, + self.eviction_batch_size, + ) + self.position_shift -= num_to_evict # pyre-ignore [8] + return super().get_freqs(input_pos + self.position_shift, seq_len) + + def rerotate_k( + self, + k: torch.Tensor, + original_position: int, + new_position: int, + ): + """ + Rerotate k from original_position to new_position. 
This is done by rerotating + k with (new_position * theta - original_position * theta) with the following matrix: + (cos(delta), -sin(delta) + sin(delta), cos(delta)) + where delta = new_position * theta - original_position * theta + + The shape of k is (batch_size, seq_len, n_local_heads, head_dim) + + Based on https://github.com/huggingface/transformers/blame/main/src/transformers/cache_utils.py#L961 + """ + seq_len = k.shape[1] + original_freqs_cos = self.freqs_cos.narrow(0, original_position, seq_len) + original_freqs_sin = self.freqs_sin.narrow(0, original_position, seq_len) + new_freqs_cos = self.freqs_cos.narrow(0, new_position, seq_len) + new_freqs_sin = self.freqs_sin.narrow(0, new_position, seq_len) + rerotation_cos = ( + new_freqs_cos * original_freqs_cos + new_freqs_sin * original_freqs_sin + ) + rerotation_sin = ( + new_freqs_sin * original_freqs_cos - new_freqs_cos * original_freqs_sin + ) + + return self.apply_rotary_emb_to_k(k, rerotation_cos, rerotation_sin) diff --git a/examples/models/llama/source_transformation/rope.py b/examples/models/llama/source_transformation/rope.py index a2a2264b24..79fb239966 100644 --- a/examples/models/llama/source_transformation/rope.py +++ b/examples/models/llama/source_transformation/rope.py @@ -13,23 +13,27 @@ def materialze_broadcast_of_rope_freq_cis( module: torch.nn.Module, ): assert isinstance(module, Transformer) - assert module.freqs_cos.dim() == 2 - dim0 = module.freqs_cos.size(0) - dim1 = module.freqs_cos.size(1) + assert module.rope.freqs_cos.dim() == 2 + dim0 = module.rope.freqs_cos.size(0) + dim1 = module.rope.freqs_cos.size(1) module_attention = module.layers[0].attention assert ( module_attention.n_local_kv_heads == module_attention.n_local_heads ), f"For rope freqs to be materialized for broadcast, q, k, v num heads must match. For q got {module_attention.n_kv_heads} for k got {module_attention.n_local_heads} and v got {module_attention.n_local_kv_heads}" num_heads = module_attention.n_local_heads - module.freqs_cos = module.freqs_cos.view(dim0, 1, dim1) - module.freqs_cos = module.freqs_cos.expand(dim0, num_heads, dim1).contiguous() - assert module.freqs_sin.dim() == 2 - assert dim0 == module.freqs_sin.size( + module.rope.freqs_cos = module.rope.freqs_cos.view(dim0, 1, dim1) + module.rope.freqs_cos = module.rope.freqs_cos.expand( + dim0, num_heads, dim1 + ).contiguous() + assert module.rope.freqs_sin.dim() == 2 + assert dim0 == module.rope.freqs_sin.size( 0 - ), f"sin and cos freq table sizes must match. Mismatch found at dim 0: {dim0} vs {module.freqs_sin.size(0)}" - assert dim1 == module.freqs_sin.size( + ), f"sin and cos freq table sizes must match. Mismatch found at dim 0: {dim0} vs {module.rope.freqs_sin.size(0)}" + assert dim1 == module.rope.freqs_sin.size( 1 - ), f"sin and cos freq table sizes must match. Mismatch found at dim 1: {dim1} vs {module.freqs_sin.size(1)}" - module.freqs_sin = module.freqs_sin.view(dim0, 1, dim1) - module.freqs_sin = module.freqs_sin.expand(dim0, num_heads, dim1).contiguous() + ), f"sin and cos freq table sizes must match. 
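# A quick standalone check (toy values, not part of this patch) of the identity
# behind rerotate_k above: combining the cached cos/sin tables at the original
# and new positions via cos(a)cos(b) + sin(a)sin(b) and sin(a)cos(b) - cos(a)sin(b)
# yields cos(new - old) and sin(new - old), i.e. a single rotation by the
# position delta, which is exactly what rerotation_cos/rerotation_sin encode.
import torch

theta = torch.rand(8, dtype=torch.float64)   # one frequency per rotation pair
old_pos, new_pos = 128, 127
cos_old, sin_old = torch.cos(old_pos * theta), torch.sin(old_pos * theta)
cos_new, sin_new = torch.cos(new_pos * theta), torch.sin(new_pos * theta)

rerotation_cos = cos_new * cos_old + sin_new * sin_old
rerotation_sin = sin_new * cos_old - cos_new * sin_old

delta = (new_pos - old_pos) * theta
torch.testing.assert_close(rerotation_cos, torch.cos(delta))
torch.testing.assert_close(rerotation_sin, torch.sin(delta))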
Mismatch found at dim 1: {dim1} vs {module.rope.freqs_sin.size(1)}" + module.rope.freqs_sin = module.rope.freqs_sin.view(dim0, 1, dim1) + module.rope.freqs_sin = module.rope.freqs_sin.expand( + dim0, num_heads, dim1 + ).contiguous() return module diff --git a/examples/models/llama/source_transformation/test_attention_sink.py b/examples/models/llama/source_transformation/test_attention_sink.py new file mode 100644 index 0000000000..8eaa992dc3 --- /dev/null +++ b/examples/models/llama/source_transformation/test_attention_sink.py @@ -0,0 +1,120 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.examples.models.llama.llama_transformer import ModelArgs + +from executorch.examples.models.llama.source_transformation.attention_sink import ( + RopeWithAttentionSink, +) +from parameterized import parameterized + + +class RopeWithAttentionSinkTest(unittest.TestCase): + + def _init_rope(self, params: ModelArgs, eviction_batch_size: int): + return RopeWithAttentionSink( + params=params, + window_size=252, + sink_size=4, + eviction_batch_size=eviction_batch_size, + ) + + def setUp(self): + torch.manual_seed(42) + self.params = ModelArgs( + use_kv_cache=True, enable_dynamic_shape=True, max_seq_len=256 + ) + self.rope_with_attention_sink = self._init_rope( + params=self.params, eviction_batch_size=1 + ) + + @parameterized.expand( + [ + [0, 10, 1, 0], # No shift + [250, 10, 1, 246], # Some shift + [256, 10, 1, 246], # All shift + [0, 10, 30, 0], # No shift with batch eviction + [250, 10, 30, 220], # Some shift with batch eviction + [256, 10, 30, 226], # All shift with batch eviction + ] + ) + def test_get_freqs( + self, input_pos, seq_len, eviction_batch_size, expected_result_pos + ): + self.rope_with_attention_sink = self._init_rope( + params=self.params, eviction_batch_size=eviction_batch_size + ) + + freqs_cos, freqs_sin = self.rope_with_attention_sink.get_freqs( + input_pos=torch.tensor([input_pos], dtype=torch.int32), + seq_len=seq_len, + ) + + torch.testing.assert_close( + freqs_cos, + self.rope_with_attention_sink.freqs_cos.narrow( + 0, expected_result_pos, seq_len + ), + ) + torch.testing.assert_close( + freqs_sin, + self.rope_with_attention_sink.freqs_sin.narrow( + 0, expected_result_pos, seq_len + ), + ) + + @parameterized.expand( + [ + [128, 127], # Rotate left + [128, 128], # No rotation + [128, 129], # Rotate right + ] + ) + def test_rotate(self, original_position, new_position): + seq_len = 32 + + q = torch.rand( + 1, seq_len, self.params.n_heads, self.params.head_dim, dtype=torch.float32 + ) + k = torch.rand( + 1, + seq_len, + self.params.n_heads, + self.params.head_dim, + dtype=torch.float32, + ) + freqs_cos, freqs_sin = self.rope_with_attention_sink.get_freqs( + input_pos=torch.tensor([original_position], dtype=torch.int32), + seq_len=seq_len, + ) + _, pre_rotated_k = self.rope_with_attention_sink.forward( + q=q, + k=k, + freqs_cos=freqs_cos, + freqs_sin=freqs_sin, + ) + + rerotated_k = self.rope_with_attention_sink.rerotate_k( + k=pre_rotated_k, + original_position=original_position, + new_position=new_position, + ) + + freqs_cos, freqs_sin = self.rope_with_attention_sink.get_freqs( + input_pos=torch.tensor([new_position], dtype=torch.int32), + seq_len=seq_len, + ) + _, expected_k = self.rope_with_attention_sink.forward( + q=q, + k=k, + freqs_cos=freqs_cos, + 
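# A plain-Python restatement (illustration only, names are mine) of the position
# bookkeeping performed by RopeWithAttentionSink.get_freqs above: once the
# shifted position would run past max_seq_length, at least eviction_batch_size
# slots are evicted and every later position is shifted left by that amount.
max_seq_length = 256          # window_size + sink_size, as in the tests above
eviction_batch_size = 30
position_shift = 0

def shifted_position(input_pos: int, seq_len: int) -> int:
    global position_shift
    if input_pos + position_shift + seq_len > max_seq_length:
        num_to_evict = max(
            input_pos + position_shift - max_seq_length + seq_len,
            eviction_batch_size,
        )
        position_shift -= num_to_evict
    return input_pos + position_shift

# Matches the expectation exercised in test_get_freqs above:
# input_pos=250, seq_len=10, eviction_batch_size=30 -> freqs read from position 220.
assert shifted_position(250, 10) == 220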
freqs_sin=freqs_sin, + ) + + torch.testing.assert_close(rerotated_k, expected_k) diff --git a/examples/models/llama3_2_vision/text_decoder/model.py b/examples/models/llama3_2_vision/text_decoder/model.py index 2d9c41b603..8cdbd8628a 100644 --- a/examples/models/llama3_2_vision/text_decoder/model.py +++ b/examples/models/llama3_2_vision/text_decoder/model.py @@ -108,6 +108,7 @@ def __init__(self, **kwargs): rope_base=params["rope_theta"], intermediate_dim=params["intermediate_dim"], ) + self.model_.requires_grad_(False) # Source transformation for MultiHeadAttention self.model_ = replace_mha_with_inference_mha(self.model_) @@ -167,11 +168,22 @@ def get_example_inputs(self): def get_example_kwarg_inputs(self): # For export we must use the prefill versions of the # causal mask and input_pos. + + # Make input_pos and mask contiguous in memory. + input_pos = self.input_pos[None, : self.n_tokens] + mask = self.causal_mask[None, : self.n_tokens] + contiguous_input_pos = torch.empty_like( + input_pos, memory_format=torch.contiguous_format + ) + contiguous_input_pos.data.copy_(input_pos.data) + contiguous_mask = torch.empty_like(mask, memory_format=torch.contiguous_format) + contiguous_mask.data.copy_(mask.data) + # Hardcoding # of tiles to be 2. image tokens per tile is 1601. if self.use_kv_cache: return { - "input_pos": self.input_pos[None, : self.n_tokens], - "mask": self.causal_mask[None, : self.n_tokens], + "input_pos": contiguous_input_pos, + "mask": contiguous_mask, "encoder_input": torch.randn( 1, self.encoder_max_seq_len, self.model_.dim, dtype=self.dtype ), diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index 36b03a480f..811eb87ac6 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -109,26 +109,5 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) ${_common_compile_options} -DET_USE_THREADPOOL ) - # pip wheels will need to be able to find the dependent libraries. On Linux, - # the .so has non-absolute dependencies on libs like "_portable_lib.so" - # without paths; as long as we `import torch` first, those dependencies will - # work. But Apple dylibs do not support non-absolute dependencies, so we need - # to tell the loader where to look for its libraries. The LC_LOAD_DYLIB - # entries for the portable_lib libraries will look like - # "@rpath/_portable_lib.cpython-310-darwin.so", so we can add an LC_RPATH - # entry to look in a directory relative to the installed location of our - # _portable_lib.so file. To see these LC_* values, run `otool -l - # libcustom_ops_aot_lib.dylib`. - if(APPLE) - set_target_properties( - custom_ops_aot_lib - PROPERTIES # Assume this library will be installed in - # /executorch/extension/llm/custom_ops/, and the - # _portable_lib.so is installed in - # /executorch/extension/pybindings/ - BUILD_RPATH "@loader_path/../../pybindings" - INSTALL_RPATH "@loader_path/../../pybindings" - ) - endif() install(TARGETS custom_ops_aot_lib DESTINATION lib) endif() diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py index ba281864a9..3a9eebd2c3 100644 --- a/extension/llm/export/quantizer_lib.py +++ b/extension/llm/export/quantizer_lib.py @@ -184,14 +184,12 @@ def get_qnn_quantizer( ) qnn_quantizer.set_per_channel_conv_quant(enable=False) qnn_quantizer.set_per_channel_linear_quant(enable=False) - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. 
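# A small standalone illustration (toy tensors, not from this patch) of the
# contiguous-copy pattern used in get_example_kwarg_inputs above: a sliced view
# may be non-contiguous, so an explicitly contiguous buffer is allocated with
# torch.empty_like(..., memory_format=torch.contiguous_format) and the data is
# copied into it before the tensor is handed to export.
import torch

mask = torch.ones(4, 6).t()[None, :3]      # transposed then sliced: a non-contiguous view
assert not mask.is_contiguous()

contiguous_mask = torch.empty_like(mask, memory_format=torch.contiguous_format)
contiguous_mask.copy_(mask)

assert contiguous_mask.is_contiguous()
assert torch.equal(contiguous_mask, mask)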
qnn_quantizer.set_quant_config( quant_dtype, is_qat=is_qat, act_observer=MinMaxObserver ) elif quant_config == "16a4w": # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. quant_dtype = QuantDtype.use_16a4w - # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`. qnn_quantizer.set_quant_config( quant_dtype, is_qat=is_qat, act_observer=MinMaxObserver ) diff --git a/extension/training/module/training_module.h b/extension/training/module/training_module.h index b31463a68f..9e7aa49cac 100644 --- a/extension/training/module/training_module.h +++ b/extension/training/module/training_module.h @@ -26,7 +26,8 @@ namespace training { * A facade class for loading programs for on-device training and executing * methods within them. */ -class ET_EXPERIMENTAL TrainingModule final : executorch::extension::Module { +class ET_EXPERIMENTAL TrainingModule final + : public executorch::extension::Module { public: explicit TrainingModule( std::unique_ptr data_loader,