From 1c7d94e3aa6fee30b3a4f618da9cd90129bc1633 Mon Sep 17 00:00:00 2001
From: Hansong <107070759+kirklandsign@users.noreply.github.com>
Date: Tue, 26 Nov 2024 15:24:56 -0800
Subject: [PATCH 01/27] Rollout ghstack_land bot to everyone

Now that it's in good shape, let's expand it to everyone.

Pull Request resolved: https://github.com/pytorch/executorch/pull/7092
Original discussion: https://github.com/pytorch/executorch/pull/6270#discussion_r1805490087
---
 .github/workflows/ghstack_land.yml | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/.github/workflows/ghstack_land.yml b/.github/workflows/ghstack_land.yml
index e3b02d2a94..09bd2a7ced 100644
--- a/.github/workflows/ghstack_land.yml
+++ b/.github/workflows/ghstack_land.yml
@@ -3,21 +3,7 @@ on:
   pull_request:
     types: [closed]
     branches:
-      - 'gh/cccclai/[0-9]+/base'
-      - 'gh/dbort/[0-9]+/base'
-      - 'gh/dvorjackz/[0-9]+/base'
-      - 'gh/guangy10/[0-9]+/base'
-      - 'gh/helunwencser/[0-9]+/base'
-      - 'gh/jorgep31415/[0-9]+/base'
-      - 'gh/kimishpatel/[0-9]+/base'
-      - 'gh/kirklandsign/[0-9]+/base'
-      - 'gh/larryliu0820/[0-9]+/base'
-      - 'gh/lucylq/[0-9]+/base'
-      - 'gh/manuelcandales/[0-9]+/base'
-      - 'gh/mcr229/[0-9]+/base'
-      - 'gh/swolchok/[0-9]+/base'
-      - 'gh/SS-JIA/[0-9]+/base'
-      - 'gh/trivedivivek/[0-9]+/base'
+      - 'gh/*/[0-9]+/base'

 jobs:
   ghstack_merge_to_main:

From 9b29b4b8ee2a52972480dea05956a3350a78ef1d Mon Sep 17 00:00:00 2001
From: George Hong
Date: Tue, 26 Nov 2024 16:20:16 -0800
Subject: [PATCH 02/27] Update training module to have super class methods
 accessible (#7082)

Update training module to have super class methods accessible (#7082)

Summary: This is needed so the training module has access to non-training
methods (e.g. constant string return methods).

Reviewed By: JacobSzwejbka

Differential Revision: D66419247
---
 extension/training/module/training_module.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/extension/training/module/training_module.h b/extension/training/module/training_module.h
index b31463a68f..9e7aa49cac 100644
--- a/extension/training/module/training_module.h
+++ b/extension/training/module/training_module.h
@@ -26,7 +26,8 @@ namespace training {
  * A facade class for loading programs for on-device training and executing
  * methods within them.
  */
-class ET_EXPERIMENTAL TrainingModule final : executorch::extension::Module {
+class ET_EXPERIMENTAL TrainingModule final
+    : public executorch::extension::Module {
  public:
   explicit TrainingModule(
       std::unique_ptr data_loader,

From dedf77bd3082756c6ff13a16e1265f3f481bc1ed Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Tue, 26 Nov 2024 16:43:09 -0800
Subject: [PATCH 03/27] Fix shared library rpath once and for all (#7096)
---
 CMakeLists.txt                          | 56 +++++++------------
 extension/llm/custom_ops/CMakeLists.txt | 21 ----------
 2 files changed, 16 insertions(+), 61 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b3b80b4e41..f960dced37 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -682,6 +682,22 @@ if(EXECUTORCH_BUILD_PTHREADPOOL
 endif()

 if(EXECUTORCH_BUILD_PYBIND)
+  # Setup RPATH.
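+  # $ORIGIN (Linux) / @loader_path (macOS) let the installed extension
+  # resolve its shared-library dependencies relative to its own location
+  # inside the wheel instead of via absolute build-tree paths.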
+ # See https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling + if(APPLE) + set(CMAKE_MACOSX_RPATH ON) + set(_rpath_portable_origin "@loader_path") + else() + set(_rpath_portable_origin $ORIGIN) + endif(APPLE) + # Use separate rpaths during build and install phases + set(CMAKE_SKIP_BUILD_RPATH FALSE) + # Don't use the install-rpath during the build phase + set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) + set(CMAKE_INSTALL_RPATH "${_rpath_portable_origin}") + # Automatically add all linked folders that are NOT in the build directory to + # the rpath (per library?) + set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pybind11) if(NOT EXECUTORCH_BUILD_EXTENSION_DATA_LOADER) @@ -765,46 +781,6 @@ if(EXECUTORCH_BUILD_PYBIND) target_include_directories(portable_lib PRIVATE ${TORCH_INCLUDE_DIRS}) target_compile_options(portable_lib PUBLIC ${_pybind_compile_options}) target_link_libraries(portable_lib PRIVATE ${_dep_libs}) - if(APPLE) - # pip wheels will need to be able to find the torch libraries. On Linux, the - # .so has non-absolute dependencies on libs like "libtorch.so" without - # paths; as long as we `import torch` first, those dependencies will work. - # But Apple dylibs do not support non-absolute dependencies, so we need to - # tell the loader where to look for its libraries. The LC_LOAD_DYLIB entries - # for the torch libraries will look like "@rpath/libtorch.dylib", so we can - # add an LC_RPATH entry to look in a directory relative to the installed - # location of our _portable_lib.so file. To see these LC_* values, run - # `otool -l _portable_lib*.so`. - set_target_properties( - portable_lib - PROPERTIES # Assume that this library will be installed in - # `site-packages/executorch/extension/pybindings`, and that - # the torch libs are in `site-packages/torch/lib`. - BUILD_RPATH "@loader_path/../../../torch/lib" - INSTALL_RPATH "@loader_path/../../../torch/lib" - # Assume is the root `site-packages/executorch` - # Need to add /extension/llm/custom_ops for - # libcustom_ops_aot_lib.dylib - BUILD_RPATH "@loader_path/../../extension/llm/custom_ops" - INSTALL_RPATH "@loader_path/../../extension/llm/custom_ops" - # Need to add /kernels/quantized for - # libquantized_ops_aot_lib.dylib - BUILD_RPATH "@loader_path/../../kernels/quantized" - INSTALL_RPATH "@loader_path/../../kernels/quantized" - ) - else() - set_target_properties( - portable_lib - PROPERTIES - # Assume is the root `site-packages/executorch` - # Need to add /extension/llm/custom_ops for - # libcustom_ops_aot_lib - # Need to add /kernels/quantized for - # libquantized_ops_aot_lib - BUILD_RPATH - "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized" - ) - endif() install(TARGETS portable_lib LIBRARY DESTINATION executorch/extension/pybindings diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index 36b03a480f..811eb87ac6 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -109,26 +109,5 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) ${_common_compile_options} -DET_USE_THREADPOOL ) - # pip wheels will need to be able to find the dependent libraries. On Linux, - # the .so has non-absolute dependencies on libs like "_portable_lib.so" - # without paths; as long as we `import torch` first, those dependencies will - # work. But Apple dylibs do not support non-absolute dependencies, so we need - # to tell the loader where to look for its libraries. 
The LC_LOAD_DYLIB
-  # entries for the portable_lib libraries will look like
-  # "@rpath/_portable_lib.cpython-310-darwin.so", so we can add an LC_RPATH
-  # entry to look in a directory relative to the installed location of our
-  # _portable_lib.so file. To see these LC_* values, run `otool -l
-  # libcustom_ops_aot_lib.dylib`.
-  if(APPLE)
-    set_target_properties(
-      custom_ops_aot_lib
-      PROPERTIES # Assume this library will be installed in
-                 # <site-packages>/executorch/extension/llm/custom_ops/, and the
-                 # _portable_lib.so is installed in
-                 # <site-packages>/executorch/extension/pybindings/
-                 BUILD_RPATH "@loader_path/../../pybindings"
-                 INSTALL_RPATH "@loader_path/../../pybindings"
-    )
-  endif()
   install(TARGETS custom_ops_aot_lib DESTINATION lib)
 endif()

From 5785fc3e80bddf1af04fda270b869881363e3308 Mon Sep 17 00:00:00 2001
From: JP <46308822+zonglinpeng@users.noreply.github.com>
Date: Tue, 26 Nov 2024 17:46:17 -0800
Subject: [PATCH 04/27] add unit test for op_add (#7087)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

add op_add shapes to generate as binaries (#7087)

Summary: Generates the add model PTEs for Cadence to execute on. Will use the
graph builder in later diffs.

Test Plan: Imported from GitHub, without a `Test Plan:` line.

{F1968254537}

Reviewed By: hsharma35

Differential Revision: D66510372

Pulled By: zonglinpeng
---
 backends/cadence/aot/TARGETS              |  20 ++++
 backends/cadence/aot/export_example.py    |  14 +--
 backends/cadence/aot/utils.py             |   3 +-
 backends/cadence/runtime/TARGETS          |   2 +
 examples/cadence/operators/TARGETS        |  26 +++++
 examples/cadence/operators/test_add_op.py | 115 ++++++++++++++++++++++
 6 files changed, 173 insertions(+), 7 deletions(-)
 create mode 100644 examples/cadence/operators/TARGETS
 create mode 100644 examples/cadence/operators/test_add_op.py

diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS
index 24b0266911..661f8cf0d4 100644
--- a/backends/cadence/aot/TARGETS
+++ b/backends/cadence/aot/TARGETS
@@ -50,6 +50,26 @@ python_library(
     ],
 )

+python_library(
+    name = "export_example",
+    srcs = [
+        "export_example.py",
+    ],
+    deps = [
+        ":passes",
+        ":utils",
+        ":ops_registrations",
+        ":replace_ops",
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot/quantizer:fusion_pass",
+        "//executorch/backends/cadence/runtime:runtime",
+        "//executorch/backends/cadence/aot/quantizer:quantizer",
+        "//executorch/backends/transforms:decompose_sdpa",
+        "//executorch/backends/transforms:remove_clone_ops",
+        "//executorch/exir:lib",
+        "//executorch/devtools:lib",
+    ],
+)
+
 python_library(
     name = "pass_utils",

diff --git a/backends/cadence/aot/export_example.py b/backends/cadence/aot/export_example.py
index 146d4f806c..4ba5bffc96 100644
--- a/backends/cadence/aot/export_example.py
+++ b/backends/cadence/aot/export_example.py
@@ -60,6 +60,7 @@ def export_model(
     model: nn.Module,
     example_inputs: Tuple[Any, ...],
     file_name: str = "CadenceDemoModel",
+    run_and_compare: bool = True,
 ):
     # create work directory for outputs and model binary
     working_dir = tempfile.mkdtemp(dir="/tmp")
@@ -112,9 +113,10 @@
     )

     # TODO: move to test infra
-    runtime.run_and_compare(
-        executorch_prog=exec_prog,
-        inputs=example_inputs,
-        ref_outputs=ref_outputs,
-        working_dir=working_dir,
-    )
+    if run_and_compare:
+        runtime.run_and_compare(
+            executorch_prog=exec_prog,
+            inputs=example_inputs,
+            ref_outputs=ref_outputs,
+            working_dir=working_dir,
+        )

diff --git a/backends/cadence/aot/utils.py b/backends/cadence/aot/utils.py
index e8b64ef567..534b4f0d9f 100644
---
a/backends/cadence/aot/utils.py +++ b/backends/cadence/aot/utils.py @@ -162,7 +162,8 @@ def print_ops_info( # Print the final ops and their counts in a tabular format logging.info( - tabulate( + "\n" + + tabulate( sorted_ops_count, headers=[ "Final Operators ", # one character longer than the longest op name diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS index 1b55a7d541..db3fe0ad1e 100644 --- a/backends/cadence/runtime/TARGETS +++ b/backends/cadence/runtime/TARGETS @@ -7,6 +7,8 @@ python_library( srcs = [ "__init__.py", "executor.py", + "runtime.py", + "utils.py" ] + glob([ "xtsc-cfg/**/*", ]), diff --git a/examples/cadence/operators/TARGETS b/examples/cadence/operators/TARGETS new file mode 100644 index 0000000000..732f1ced09 --- /dev/null +++ b/examples/cadence/operators/TARGETS @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("odai_jarvis") + + +python_unittest( + name = "test_add_op", + srcs = [ + "test_add_op.py", + ], + typing = True, + supports_static_listing = False, + deps = [ + "fbsource//third-party/pypi/parameterized:parameterized", + "//caffe2:torch", + "//executorch/backends/cadence/aot:ops_registrations", + "//executorch/backends/cadence/aot:export_example", + "//executorch/backends/cadence/aot:compiler", + ], +) diff --git a/examples/cadence/operators/test_add_op.py b/examples/cadence/operators/test_add_op.py new file mode 100644 index 0000000000..5481540b4f --- /dev/null +++ b/examples/cadence/operators/test_add_op.py @@ -0,0 +1,115 @@ +# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
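+
+# These tests drive the Cadence AOT export flow (export_example.export_model)
+# on small torch.add graphs. They pass run_and_compare=False, so each case
+# only checks that the model exports to a .pte binary for the given
+# shape/alpha combination; nothing is run or compared on-device.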
+ +import unittest +from typing import Tuple + +from parameterized import parameterized + +from executorch.backends.cadence.aot.ops_registrations import * # noqa + +import torch +import torch.nn as nn +from executorch.backends.cadence.aot.export_example import export_model + + +class ATenOpTestCases(unittest.TestCase): + @parameterized.expand( + [ + [(7, 5, 6), (7, 5, 6)], + [(7, 5, 6), (1)], + [(1), (7, 5, 6)], + [(1), (7, 5, 6), 2.23], + [(1), (7, 5, 6), -1.0], + [(1), (7, 5, 6), -2.23], + [(7, 5, 6), (7, 5, 6), 1.23], + [(6, 7), (6, 7)], + [(6, 7), (6, 7), 2], + # Broadcast tests (should be optimized on G3) + [(1, 32, 64), (1, 1, 64)], + [(1, 32, 64), (64)], + [(1, 1, 32), (32)], + [(16, 1, 16), (1, 1, 16)], + [(16, 1, 16), (16)], + [(1, 4, 8, 8), (1, 1, 8, 8)], + [(1, 4, 8, 8), (8, 8)], + # Broadcast tests (should go to portable ops) + [(1, 10, 1, 8), (4, 1, 4, 1)], + [(1, 1, 16), (1, 8, 1), 2.5], + # # aten.upsample_nearest2d tests + [(5, 6, 6, 8), (5, 6, 6, 8)], + [(1, 1, 12, 16), (1, 1, 12, 16)], + ] + ) + def test_aten_add_out( + self, Xshape: Tuple[int], Yshape: Tuple[int], alpha: float = 1 + ) -> None: + class AddTensor(nn.Module): + def __init__(self, alpha: float): + super().__init__() + self.alpha = alpha + + def forward(self, x: torch.Tensor, y: torch.Tensor): + return torch.add(x, y, alpha=self.alpha) + + model = AddTensor(alpha) + + X = torch.randn(Xshape) + Y = torch.randn(Yshape) + + model.eval() + export_model( + model, (X, Y), file_name=self._testMethodName, run_and_compare=False + ) + + @parameterized.expand( + [ + [(7, 5, 6), (7, 5, 6)], + [(7, 5, 6), (1)], + [(1), (7, 5, 6)], + [(1), (7, 5, 6), 2.23], + [(1), (7, 5, 6), -1.0], + [(1), (7, 5, 6), -2.23], + [(7, 5, 6), (7, 5, 6), 1.23], + [(6, 7), (6, 7)], + [(6, 7), (6, 7), 2], + # Broadcast tests (should be optimized on G3) + [(1, 32, 64), (1, 1, 64)], + [(1, 32, 64), (64)], + [(1, 1, 32), (32)], + [(16, 1, 16), (1, 1, 16)], + [(16, 1, 16), (16)], + [(1, 4, 8, 8), (1, 1, 8, 8)], + [(1, 4, 8, 8), (8, 8)], + # Broadcast tests (should go to portable ops) + [(1, 10, 1, 8), (4, 1, 4, 1)], + [(1, 1, 16), (1, 8, 1), 2.5], + # # aten.upsample_nearest2d tests + [(5, 6, 6, 8), (5, 6, 6, 8)], + [(1, 1, 12, 16), (1, 1, 12, 16)], + ] + ) + def test_aten_add_scalar_out( + self, Xshape: Tuple[int], Yshape: Tuple[int], alpha: float = 1 + ) -> None: + # Tensor-Scalar addition + class AddScalar(nn.Module): + def __init__(self, alpha: float): + super().__init__() + self.alpha = alpha + + def forward(self, x: torch.Tensor, y: float): + return torch.add(x, y, alpha=self.alpha) + + model = AddScalar(alpha) + + X = torch.randn(Xshape) + Y = 2.34 + + model.eval() + export_model( + model, (X, Y), file_name=self._testMethodName, run_and_compare=False + ) + + +if __name__ == "__main__": + unittest.main() From d679ad70a7745c60df581d6b110e6f79c389feb9 Mon Sep 17 00:00:00 2001 From: Max Ren <40742183+mcr229@users.noreply.github.com> Date: Wed, 27 Nov 2024 02:00:41 -0500 Subject: [PATCH 05/27] Update XNNPACK to 1ed874e65 (#6538) * Update XNNPACK to c88c8504fd9889c22391f0f3ece6061a7f855cf3 fix bug * Update test_llama.sh and test_llava.sh to use release mode as default --- .ci/scripts/test_llama.sh | 11 +++++++---- .ci/scripts/test_llava.sh | 16 ++++++++-------- .github/workflows/trunk.yml | 2 +- backends/xnnpack/third-party/XNNPACK | 2 +- backends/xnnpack/third-party/xnnpack.buck.bzl | 6 ++++-- .../xnnpack/third-party/xnnpack_src_defs.bzl | 12 +----------- 6 files changed, 22 insertions(+), 27 deletions(-) diff --git a/.ci/scripts/test_llama.sh 
b/.ci/scripts/test_llama.sh index e109845547..5e5ed588a2 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -51,6 +51,9 @@ UPLOAD_DIR="${UPLOAD_DIR:-}" # Default PT2E_QUANTIZE to empty string if not set PT2E_QUANTIZE="${PT2E_QUANTIZE:-}" +# Default CMake Build Type to release mode +CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} + if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args echo "Expecting atleast 4 positional arguments" echo "Usage: [...]" @@ -143,7 +146,7 @@ cmake_install_executorch_libraries() { rm -rf cmake-out retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ @@ -157,7 +160,7 @@ cmake_install_executorch_libraries() { -DQNN_SDK_ROOT="$QNN_SDK_ROOT" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out . - cmake --build cmake-out -j9 --target install --config Debug + cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE" } cmake_build_llama_runner() { @@ -165,14 +168,14 @@ cmake_build_llama_runner() { dir="examples/models/llama" retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out/${dir} \ ${dir} - cmake --build cmake-out/${dir} -j9 --config Debug + cmake --build cmake-out/${dir} -j9 --config "$CMAKE_BUILD_TYPE" } diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 1057fa8f4a..a30143d895 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -8,11 +8,11 @@ set -exu # shellcheck source=/dev/null -BUILD_TYPE=${1:-Debug} TARGET_OS=${2:-Native} BUILD_DIR=${3:-cmake-out} +CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} -echo "Building with BUILD_TYPE: $BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR" +echo "Building with CMAKE_BUILD_TYPE: $CMAKE_BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR" if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then PYTHON_EXECUTABLE=python3 @@ -32,7 +32,7 @@ if hash nproc &> /dev/null; then NPROC=$(nproc); fi EXECUTORCH_COMMON_CMAKE_ARGS=" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ @@ -49,7 +49,7 @@ cmake_install_executorch_libraries() { ${EXECUTORCH_COMMON_CMAKE_ARGS} \ -B${BUILD_DIR} . - cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE} } cmake_install_executorch_libraries_for_android() { @@ -59,14 +59,14 @@ cmake_install_executorch_libraries_for_android() { ${EXECUTORCH_COMMON_CMAKE_ARGS} \ -B${BUILD_DIR} . 
- cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE} } LLAVA_COMMON_CMAKE_ARGS=" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON" @@ -81,7 +81,7 @@ cmake_build_llava_runner() { -B${BUILD_DIR}/${dir} \ ${dir} - cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${CMAKE_BUILD_TYPE} } @@ -98,7 +98,7 @@ cmake_build_llava_runner_for_android() { -B${BUILD_DIR}/${dir} \ ${dir} - cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${CMAKE_BUILD_TYPE} } # only export the one without custom op for now since it's diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index c5d33038e8..18c91691e9 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -290,7 +290,7 @@ jobs: # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava # # run e2e (export, tokenizer and runner) - # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh Release + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh test-qnn-model: name: test-qnn-model diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index d5d572e46e..4ea82e595b 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit d5d572e46ed3929fa3e67f6174192893943cf724 +Subproject commit 4ea82e595b36106653175dcb04b2aa532660d0d8 diff --git a/backends/xnnpack/third-party/xnnpack.buck.bzl b/backends/xnnpack/third-party/xnnpack.buck.bzl index d2068661fe..6ce0316010 100644 --- a/backends/xnnpack/third-party/xnnpack.buck.bzl +++ b/backends/xnnpack/third-party/xnnpack.buck.bzl @@ -42,7 +42,7 @@ def define_xnnpack(): "XNNPACK/src/mutex.c", "XNNPACK/src/normalization.c", "XNNPACK/src/operator-utils.c", - "XNNPACK/src/packing.cc", + "XNNPACK/src/reference/packing.cc", ], headers = get_xnnpack_headers(), header_namespace = "", @@ -67,7 +67,7 @@ def define_xnnpack(): # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. native.cxx_library( name = "subgraph", - srcs = SUBGRAPH_SRCS, + srcs = SUBGRAPH_SRCS + ["XNNPACK/src/datatype.c"], compiler_flags = [ "-Wno-error=missing-braces", # required since the SGX toolchain does not have this by default ], @@ -1076,6 +1076,8 @@ def define_xnnpack(): "XNNPACK/src/configs/hardware-config.c", "XNNPACK/src/microparams-init.c", "XNNPACK/src/microkernel-utils.c", + "XNNPACK/src/reference/binary-elementwise.cc", + "XNNPACK/src/reference/unary-elementwise.cc", ], headers = get_xnnpack_headers(), exported_headers = { diff --git a/backends/xnnpack/third-party/xnnpack_src_defs.bzl b/backends/xnnpack/third-party/xnnpack_src_defs.bzl index 038b90acab..8cb9affede 100644 --- a/backends/xnnpack/third-party/xnnpack_src_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_src_defs.bzl @@ -17,24 +17,14 @@ def prod_srcs_for_arch_wrapper(arch): return define_xnnpack_build_src(prod_srcs) def get_xnnpack_headers(): - # XNNPACK Headers in the path containing xnnpack/ or configs/ - # do not contain the src/ path. 
However headers not in xnnpack/ or - # configs/ are prepend with the src/ path. This function helps us - # to correctly parse all the header files to the correct name src_headers = subdir_glob([ ("XNNPACK/src", "**/*.h"), ]) - fixed_headers = {} - for k, v in src_headers.items(): - new_key = k - if not k.startswith("xnnpack") and not k.startswith("configs"): - new_key = "src/{}".format(k) - fixed_headers[new_key] = v include_headers = subdir_glob([ ("XNNPACK/include", "*.h"), ]) - return fixed_headers | include_headers + return src_headers | include_headers OPERATOR_SRCS = define_xnnpack_build_src(_OPERATOR_SRCS) SUBGRAPH_SRCS = define_xnnpack_build_src(_SUBGRAPH_SRCS) From a8fa8574469e4aa06983b8695a7ded2182808d17 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Fri, 15 Nov 2024 15:39:55 +0100 Subject: [PATCH 06/27] Add FVP testing to ops Add expected fails accordingly Signed-off-by: Erik Lundell Change-Id: Ic76626256ae4c53258536ffa747a7ee02832b168 --- backends/arm/test/ops/test_avg_pool.py | 13 +++++--- backends/arm/test/ops/test_bmm.py | 22 ++++++++++++-- backends/arm/test/ops/test_cat.py | 10 +++++- backends/arm/test/ops/test_clone.py | 5 ++- backends/arm/test/ops/test_conv1d.py | 8 ++++- backends/arm/test/ops/test_conv2d.py | 9 +++++- backends/arm/test/ops/test_conv_combos.py | 7 ++++- backends/arm/test/ops/test_depthwise_conv.py | 31 +++++++++++++++---- backends/arm/test/ops/test_div.py | 30 +++++++++++++++--- backends/arm/test/ops/test_exp.py | 7 +++-- backends/arm/test/ops/test_expand.py | 9 +++++- backends/arm/test/ops/test_full.py | 9 +++++- backends/arm/test/ops/test_hardtanh.py | 21 ++++++++++--- backends/arm/test/ops/test_layer_norm.py | 23 ++++++++++++-- backends/arm/test/ops/test_log.py | 5 ++- backends/arm/test/ops/test_mul.py | 32 ++++++++++++-------- backends/arm/test/ops/test_permute.py | 16 ++++++++-- backends/arm/test/ops/test_reciprocal.py | 19 ++++++------ backends/arm/test/ops/test_sub.py | 7 +++-- 19 files changed, 223 insertions(+), 60 deletions(-) diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py index afd079fb95..ad3ddf8c0a 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -23,10 +23,10 @@ test_data_suite = [ # (test_name, test_data, [kernel_size, stride, padding]) - ("zeros", torch.zeros(20, 16, 50, 32), [4, 2, 0]), - ("ones", torch.zeros(20, 16, 50, 32), [4, 2, 0]), - ("rand", torch.rand(20, 16, 50, 32), [4, 2, 0]), - ("randn", torch.randn(20, 16, 50, 32), [4, 2, 0]), + ("zeros", torch.zeros(1, 16, 50, 32), [4, 2, 0]), + ("ones", torch.zeros(1, 16, 50, 32), [4, 2, 0]), + ("rand", torch.rand(1, 16, 50, 32), [4, 2, 0]), + ("randn", torch.randn(1, 16, 50, 32), [4, 2, 0]), ] @@ -101,7 +101,7 @@ def _test_avgpool2d_tosa_ethos_BI_pipeline( test_data: Tuple[torch.tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -116,7 +116,10 @@ def _test_avgpool2d_tosa_ethos_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_avgpool2d_tosa_MI( diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 6246657120..824ec46372 100644 --- a/backends/arm/test/ops/test_bmm.py 
+++ b/backends/arm/test/ops/test_bmm.py @@ -41,7 +41,7 @@ def forward(self, x, y): class BMMSingleInput(torch.nn.Module): test_parameters = [ (torch.rand(20, 3, 3),), - (torch.ones(2, 128, 128),), + (torch.rand(2, 128, 128),), (10000 * torch.randn(4, 25, 25),), (5 + 5 * torch.randn(3, 64, 64),), ] @@ -96,7 +96,7 @@ def _test_bmm_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor, ...], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -110,7 +110,10 @@ def _test_bmm_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) @parameterized.expand(BMM.test_parameters) def test_bmm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): @@ -143,9 +146,20 @@ def test_bmm_single_input_tosa_BI(self, operand1: torch.Tensor): self._test_bmm_tosa_BI_pipeline(self.BMMSingleInput(), test_data) @parameterized.expand(BMM.test_parameters) + @unittest.expectedFailure def test_bmm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) - self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data) + self._test_bmm_ethosu_BI_pipeline( + self.BMM(), common.get_u55_compile_spec(), test_data + ) + + @parameterized.expand(BMM.test_parameters) + @common.expectedFailureOnFVP + def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_ethosu_BI_pipeline( + self.BMM(), common.get_u85_compile_spec(), test_data + ) # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy @parameterized.expand(BMMSingleInput.test_parameters) @@ -156,7 +170,9 @@ def test_bmm_single_input_u55_BI(self, operand1: torch.Tensor): self.BMMSingleInput(), common.get_u55_compile_spec(), test_data ) + # Numerical issues on FVP, MLETORCH 534 @parameterized.expand(BMMSingleInput.test_parameters) + @common.expectedFailureOnFVP def test_bmm_single_input_u85_BI(self, operand1: torch.Tensor): test_data = (operand1,) self._test_bmm_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py index b380c44d52..88846369d0 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -96,7 +96,7 @@ def _test_cat_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[tuple[torch.Tensor, ...], int], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -108,10 +108,14 @@ def _test_cat_ethosu_BI_pipeline( .check(["torch.ops.quantized_decomposed"]) .to_edge() .partition() + .dump_artifact() .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(inputs=test_data) @parameterized.expand(Cat.test_parameters) def test_cat_tosa_MI(self, operands: tuple[torch.Tensor, ...], dim: int): @@ -129,14 +133,18 @@ def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int): test_data = (operands, dim) self._test_cat_tosa_BI_pipeline(self.Cat(), test_data) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Cat.test_parameters) + @common.expectedFailureOnFVP def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): 
test_data = (operands, dim) self._test_cat_ethosu_BI_pipeline( self.Cat(), common.get_u55_compile_spec(), test_data ) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Cat.test_parameters) + @common.expectedFailureOnFVP def test_cat_u85_BI(self, operands: tuple[torch.Tensor, ...], dim: int): test_data = (operands, dim) self._test_cat_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 4721f257b0..6b5216a8e1 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -85,7 +85,7 @@ def _test_clone_tosa_ethos_pipeline( test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -94,7 +94,10 @@ def _test_clone_tosa_ethos_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) def _test_clone_tosa_u55_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py index 133148faef..f00c7984a1 100644 --- a/backends/arm/test/ops/test_conv1d.py +++ b/backends/arm/test/ops/test_conv1d.py @@ -268,7 +268,7 @@ def _test_conv1d_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize() .export() @@ -277,7 +277,10 @@ def _test_conv1d_ethosu_BI_pipeline( .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite) def test_conv1d_tosa_MI(self, test_name, model): @@ -295,6 +298,9 @@ def test_conv1d_u55_BI(self, test_name, model): model, common.get_u55_compile_spec(), model.get_inputs() ) + # This specific test case has numerical errors on FVP, MLETORCH-520. 
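+    # (The remove() below runs at class-definition time, so the entry is
+    # gone before @parameterized.expand(testsuite) underneath is evaluated.)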
+ testsuite.remove(("5_3x2x128_st1", conv1d_5_3x2x128_st1)) + @parameterized.expand(testsuite) def test_conv1d_u85_BI(self, test_name, model): self._test_conv1d_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 43c3e85139..21df4bf0d5 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -295,7 +295,7 @@ def _test_conv2d_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -308,7 +308,10 @@ def _test_conv2d_ethosu_BI_pipeline( .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite) def test_conv2d_tosa_MI(self, test_name, model): @@ -318,6 +321,10 @@ def test_conv2d_tosa_MI(self, test_name, model): def test_conv2d_tosa_BI(self, test_name, model): self._test_conv2d_tosa_BI_pipeline(model, model.get_inputs()) + # These cases have numerical issues on FVP, MLETORCH-520 + testsuite.remove(("2x2_3x2x40x40_nobias", conv2d_2x2_3x2x40x40_nobias)) + testsuite.remove(("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1)) + @parameterized.expand(testsuite) def test_conv2d_u55_BI(self, test_name, model): self._test_conv2d_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 3e9bdef958..7555fff720 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -238,7 +238,7 @@ def _test_conv_combo_ethos_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -251,7 +251,10 @@ def _test_conv_combo_ethos_BI_pipeline( .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(list(module.edge_op_list)) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) #################### ## Conv + meandim ## @@ -272,6 +275,8 @@ def test_conv_meandim_u55_BI(self): model.get_inputs(), ) + # Numerical Issues on FVP, MLETORCH-520 + @common.expectedFailureOnFVP def test_conv_meandim_u85_BI(self): model = ComboConv2dMeandim() self._test_conv_combo_ethos_BI_pipeline( diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 4bfa863c49..28cb9ac844 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -8,8 +8,6 @@ from typing import Tuple -import pytest - import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.ops.test_conv1d import Conv1d @@ -160,8 +158,8 @@ testsuite_conv1d = [ ("2_1x6x4_gp6_st1", dw_conv1d_2_1x6x4_gp6_st1), - ("3_1x3x256_gp3_st1", dw_conv1d_3_1x3x256_gp3_st1), ("two_dw_conv1d", two_dw_conv1d), + ("3_1x3x256_gp3_st1", dw_conv1d_3_1x3x256_gp3_st1), ("3_1x3x14_gp3_st1", dw_conv1d_3_1x3x14_gp3_st1), ] @@ -217,7 +215,7 @@ def _test_dw_conv_ethos_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -230,7 +228,10 @@ def _test_dw_conv_ethos_BI_pipeline( 
.check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite_conv1d + testsuite_conv2d) def test_dw_conv_tosa_MI(self, test_name: str, model: torch.nn.Module): @@ -238,11 +239,15 @@ def test_dw_conv_tosa_MI(self, test_name: str, model: torch.nn.Module): # TODO: Investigate flakyness (MLTORCH-307) @parameterized.expand(testsuite_conv1d + testsuite_conv2d) - @pytest.mark.flaky(reruns=3) def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv_tosa_BI_pipeline(model, model.get_inputs()) + testsuite_conv2d.remove( + ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1) + ) # Works + @parameterized.expand(testsuite_conv2d, skip_on_empty=True) + @common.expectedFailureOnFVP def test_dw_conv2d_u55_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): @@ -269,7 +274,21 @@ def test_dw_conv1d_u55_BI( model.get_inputs(), ) - @parameterized.expand(testsuite_conv1d + testsuite_conv2d) + # All test cases except 3x3_1x3x256x256_gp3_st1 have numerical issues on FVP. MLETORCH-520 + @parameterized.expand(testsuite_conv1d[:-2] + testsuite_conv2d) + @common.expectedFailureOnFVP + def test_dw_conv_u85_BI_xfails( + self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False + ): + self._test_dw_conv_ethos_BI_pipeline( + model, + common.get_u85_compile_spec( + permute_memory_to_nhwc=True, quantize_io=set_quantize_io + ), + model.get_inputs(), + ) + + @parameterized.expand(testsuite_conv1d[-2:]) def test_dw_conv_u85_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index 28cc686690..b3815f3e7c 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -136,10 +136,10 @@ def _test_div_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, atol=1, rtol=0.1) ) - def _test_div_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_div_ethos_BI_pipeline( + self, module: torch.nn.Module, compile_spec, test_data: Tuple[torch.Tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -155,7 +155,10 @@ def _test_div_u55_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_div_tosa_MI( @@ -180,7 +183,9 @@ def test_div_tosa_BI( test_data = (input_, other_) self._test_div_tosa_BI_pipeline(self.Div(), test_data) + # Numerical issues on FVP likely due to mul op, MLETORCH-521 @parameterized.expand(test_data_suite) + @common.expectedFailureOnFVP def test_div_u55_BI( self, test_name: str, @@ -189,4 +194,21 @@ def test_div_u55_BI( rounding_mode: Optional[str] = None, ): test_data = (input_, other_) - self._test_div_u55_BI_pipeline(self.Div(), test_data) + self._test_div_ethos_BI_pipeline( + self.Div(), common.get_u55_compile_spec(), test_data + ) + + # Numerical issues on FVP likely due to mul op, MLETORCH-521 + @parameterized.expand(test_data_suite) + @common.expectedFailureOnFVP + def test_div_u85_BI( + self, + test_name: str, + input_: 
Union[torch.Tensor, torch.types.Number], + other_: Union[torch.Tensor, torch.types.Number], + rounding_mode: Optional[str] = None, + ): + test_data = (input_, other_) + self._test_div_ethos_BI_pipeline( + self.Div(), common.get_u85_compile_spec(), test_data + ) diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py index c706b7b206..f33e0a9058 100644 --- a/backends/arm/test/ops/test_exp.py +++ b/backends/arm/test/ops/test_exp.py @@ -20,7 +20,7 @@ ("zeros", torch.zeros(1, 10, 10, 10)), ("ones", torch.ones(10, 10, 10)), ("rand", torch.rand(10, 10) - 0.5), - ("randn_pos", torch.randn(10) + 10), + ("randn_pos", torch.randn(1, 4, 4, 4) + 10), ("randn_neg", torch.randn(10) - 10), ("ramp", torch.arange(-16, 16, 0.2)), ] @@ -78,7 +78,7 @@ def _test_exp_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -93,7 +93,10 @@ def _test_exp_ethosu_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_exp_tosa_MI( diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py index effa7ce713..27f311b546 100644 --- a/backends/arm/test/ops/test_expand.py +++ b/backends/arm/test/ops/test_expand.py @@ -81,7 +81,7 @@ def _test_expand_ethosu_BI_pipeline( self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: Tuple ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -95,7 +95,10 @@ def _test_expand_ethosu_BI_pipeline( .check_not(["torch.ops.aten.expand.default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(Expand.test_parameters) def test_expand_tosa_MI(self, test_input, multiples): @@ -105,13 +108,17 @@ def test_expand_tosa_MI(self, test_input, multiples): def test_expand_tosa_BI(self, test_input, multiples): self._test_expand_tosa_BI_pipeline(self.Expand(), (test_input, multiples)) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Expand.test_parameters) + @common.expectedFailureOnFVP def test_expand_u55_BI(self, test_input, multiples): self._test_expand_ethosu_BI_pipeline( common.get_u55_compile_spec(), self.Expand(), (test_input, multiples) ) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Expand.test_parameters) + @common.expectedFailureOnFVP def test_expand_u85_BI(self, test_input, multiples): self._test_expand_ethosu_BI_pipeline( common.get_u85_compile_spec(), self.Expand(), (test_input, multiples) diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index d4cfc5c369..9857a7b87b 100644 --- a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -97,7 +97,7 @@ def _test_full_tosa_BI_pipeline( def _test_full_tosa_ethos_pipeline( self, compile_spec: list[CompileSpec], module: torch.nn.Module, test_data: Tuple ): - ( + tester = ( ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize() .export() @@ -107,7 +107,10 
@@ def _test_full_tosa_ethos_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_full_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): self._test_full_tosa_ethos_pipeline( @@ -140,14 +143,18 @@ def test_full_tosa_MI(self, test_tensor: Tuple): def test_full_tosa_BI(self, test_tensor: Tuple): self._test_full_tosa_BI_pipeline(self.AddVariableFull(), test_tensor, False) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(AddVariableFull.test_parameters) + @common.expectedFailureOnFVP def test_full_u55_BI(self, test_tensor: Tuple): self._test_full_tosa_u55_pipeline( self.AddVariableFull(), test_tensor, ) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(AddVariableFull.test_parameters) + @common.expectedFailureOnFVP def test_full_u85_BI(self, test_tensor: Tuple): self._test_full_tosa_u85_pipeline( self.AddVariableFull(), diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py index a9f12abdf0..10073c5095 100644 --- a/backends/arm/test/ops/test_hardtanh.py +++ b/backends/arm/test/ops/test_hardtanh.py @@ -87,15 +87,15 @@ def _test_hardtanh_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_hardtanh_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_hardtanh_tosa_ethosu_BI_pipeline( + self, compile_spec, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -106,7 +106,10 @@ def _test_hardtanh_tosa_u55_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_hardtanh_tosa_MI( @@ -122,4 +125,12 @@ def test_hardtanh_tosa_BI(self, test_name: str, test_data: torch.Tensor): @parameterized.expand(test_data_suite) def test_hardtanh_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): - self._test_hardtanh_tosa_u55_BI_pipeline(self.HardTanh(), (test_data,)) + self._test_hardtanh_tosa_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.HardTanh(), (test_data,) + ) + + @parameterized.expand(test_data_suite) + def test_hardtanh_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): + self._test_hardtanh_tosa_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.HardTanh(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py index f059d71eba..0b06044a59 100644 --- a/backends/arm/test/ops/test_layer_norm.py +++ b/backends/arm/test/ops/test_layer_norm.py @@ -115,7 +115,7 @@ def _test_layernorm_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( model=module, example_inputs=test_data, @@ -128,7 +128,10 @@ def 
_test_layernorm_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_layer_norm_tosa_MI( @@ -152,8 +155,10 @@ def test_layer_norm_tosa_BI( self.LayerNorm(*model_params), (test_data,) ) + # Numerical issues on FVP likely due to mul op, MLETORCH-521 # Skip tests that require transposes. @parameterized.expand(test_data_suite[:-2]) + @common.expectedFailureOnFVP def test_layer_norm_u55_BI( self, test_name: str, @@ -164,7 +169,21 @@ def test_layer_norm_u55_BI( self.LayerNorm(*model_params), common.get_u55_compile_spec(), (test_data,) ) - @parameterized.expand(test_data_suite) + # Numerical issues on FVP likely due to mul op, MLETORCH-521 + @parameterized.expand(test_data_suite[:-1]) + @common.expectedFailureOnFVP + def test_layer_norm_u85_BI_fvp_xfails( + self, + test_name: str, + test_data: torch.Tensor, + model_params, + ): + self._test_layernorm_ethosu_BI_pipeline( + self.LayerNorm(*model_params), common.get_u85_compile_spec(), (test_data,) + ) + + @parameterized.expand(test_data_suite[-1:]) + @unittest.skip # Flaky def test_layer_norm_u85_BI( self, test_name: str, diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py index 847635ea36..10175d27fb 100644 --- a/backends/arm/test/ops/test_log.py +++ b/backends/arm/test/ops/test_log.py @@ -78,7 +78,7 @@ def _test_log_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -93,7 +93,10 @@ def _test_log_ethosu_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_log_tosa_MI( diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index 7fa20c2566..8f0321ea5f 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -16,9 +16,9 @@ test_data_sute = [ # (test_name, input, other,) See torch.mul() for info ( - "op_mul_rank1_ones", - torch.ones(5), - torch.ones(5), + "op_mul_rank1_rand", + torch.rand(5) * 3.7, + torch.rand(5) * 1.5, ), ( "op_mul_rank2_rand", @@ -32,23 +32,23 @@ ), ( "op_mul_rank4_randn", - torch.randn(5, 10, 25, 20), - torch.randn(5, 10, 25, 20), + torch.randn(1, 10, 25, 20), + torch.randn(1, 10, 25, 20), ), ( "op_mul_rank4_ones_mul_negative", torch.ones(1, 10, 25, 20), - (-1) * torch.ones(5, 10, 25, 20), + (-1) * torch.ones(1, 10, 25, 20), ), ( "op_mul_rank4_negative_large_rand", - (-200) * torch.rand(5, 10, 25, 20), - torch.rand(5, 1, 1, 20), + (-200) * torch.rand(1, 10, 25, 20), + torch.rand(1, 1, 1, 20), ), ( "op_mul_rank4_large_randn", - 200 * torch.randn(5, 10, 25, 20), - torch.rand(5, 10, 25, 1), + 200 * torch.randn(1, 10, 25, 20), + torch.rand(1, 10, 25, 1), ), ] @@ -112,7 +112,7 @@ def _test_mul_ethosu_BI_pipeline( module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -126,7 +126,10 @@ def _test_mul_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if 
common.is_option_enabled("corstone300"):
+            tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)

     @parameterized.expand(test_data_sute)
     def test_mul_tosa_MI(
@@ -149,7 +152,9 @@ def test_mul_tosa_BI(
         test_data = (input_, other_)
         self._test_mul_tosa_BI_pipeline(self.Mul(), test_data)

+    # Numerical issues on FVP, MLETORCH-521
     @parameterized.expand(test_data_sute)
+    @common.expectedFailureOnFVP
     def test_mul_u55_BI(
         self,
         test_name: str,
@@ -161,7 +166,10 @@ def test_mul_u55_BI(
             common.get_u55_compile_spec(), self.Mul(), test_data
         )

-    @parameterized.expand(test_data_sute)
+    # Numerical issues on FVP, MLETORCH-521
+    # test_data_sute[0] works on U85
+    @parameterized.expand(test_data_sute[1:])
+    @common.expectedFailureOnFVP
     def test_mul_u85_BI(
         self,
         test_name: str,

diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py
index 62b6b823de..92400215b7 100644
--- a/backends/arm/test/ops/test_permute.py
+++ b/backends/arm/test/ops/test_permute.py
@@ -100,7 +100,7 @@ def _test_permute_ethos_BI_pipeline(
         test_data: Tuple[torch.Tensor],
     ):
         quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
-        (
+        tester = (
             ArmTester(
                 module,
                 example_inputs=test_data,
@@ -117,6 +117,8 @@ def _test_permute_ethos_BI_pipeline(
             .to_executorch()
             .serialize()
         )
+        if common.is_option_enabled("corstone300"):
+            tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)

     @parameterized.expand(test_data_suite)
     def test_permute_tosa_MI(
@@ -143,10 +145,20 @@ def test_permute_u55_BI(
             self.Permute(dims=dims), common.get_u55_compile_spec(), (test_data,)
         )

-    @parameterized.expand(test_data_suite)
+    @parameterized.expand(test_data_suite[:-2])
     def test_permute_u85_BI(
         self, test_name: str, test_data: torch.Tensor, dims: list[int]
     ):
         self._test_permute_ethos_BI_pipeline(
             self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,)
         )
+
+    # Fails on FVP since N > 1 is not supported.
MLETORCH-517 + @parameterized.expand(test_data_suite[-2:]) + @common.expectedFailureOnFVP + def test_permute_u85_BI_xfails( + self, test_name: str, test_data: torch.Tensor, dims: list[int] + ): + self._test_permute_ethos_BI_pipeline( + self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_reciprocal.py b/backends/arm/test/ops/test_reciprocal.py index 7745a614e6..876f063c76 100644 --- a/backends/arm/test/ops/test_reciprocal.py +++ b/backends/arm/test/ops/test_reciprocal.py @@ -22,12 +22,12 @@ torch.rand(5) * 5, ), ("op_reciprocal_rank1_negative_ones", torch.ones(5) * (-1)), - ("op_reciprocal_rank4_ones", torch.ones(5, 10, 25, 20)), - ("op_reciprocal_rank4_negative_ones", (-1) * torch.ones(5, 10, 25, 20)), - ("op_reciprocal_rank4_ones_reciprocal_negative", torch.ones(5, 10, 25, 20)), - ("op_reciprocal_rank4_large_rand", 200 * torch.rand(5, 10, 25, 20)), - ("op_reciprocal_rank4_negative_large_rand", (-200) * torch.rand(5, 10, 25, 20)), - ("op_reciprocal_rank4_large_randn", 200 * torch.randn(5, 10, 25, 20) + 1), + ("op_reciprocal_rank4_ones", torch.ones(1, 10, 25, 20)), + ("op_reciprocal_rank4_negative_ones", (-1) * torch.ones(1, 10, 25, 20)), + ("op_reciprocal_rank4_ones_reciprocal_negative", torch.ones(1, 10, 25, 20)), + ("op_reciprocal_rank4_large_rand", 200 * torch.rand(1, 10, 25, 20)), + ("op_reciprocal_rank4_negative_large_rand", (-200) * torch.rand(1, 10, 25, 20)), + ("op_reciprocal_rank4_large_randn", 200 * torch.randn(1, 10, 25, 20) + 1), ] @@ -81,7 +81,7 @@ def _test_reciprocal_tosa_BI_pipeline( def _test_reciprocal_u55_BI_pipeline( self, module: torch.nn.Module, test_data: tuple[torch.Tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -95,15 +95,16 @@ def _test_reciprocal_u55_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_reciprocal_tosa_MI(self, test_name: str, input_: torch.Tensor): test_data = (input_,) self._test_reciprocal_tosa_MI_pipeline(self.Reciprocal(), test_data) - # Expected to fail since ArmQuantizer cannot quantize a Reciprocal layer - # TODO(MLETORCH-129) @parameterized.expand(test_data_suite) def test_reciprocal_tosa_BI(self, test_name: str, input_: torch.Tensor): diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index 5c67240e52..327a8de994 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -17,7 +17,7 @@ from parameterized import parameterized -class TestSimpleSub(unittest.TestCase): +class TestSub(unittest.TestCase): class Sub(torch.nn.Module): test_parameters = [ (torch.ones(5),), @@ -82,7 +82,7 @@ def _test_sub_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -96,7 +96,10 @@ def _test_sub_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(Sub.test_parameters) def test_sub_tosa_MI(self, test_data: torch.Tensor): From f8bc7747cde15c7297f06d637991649112ef12c6 Mon Sep 17 00:00:00 2001 From: cad-audio <86048415+cad-audio@users.noreply.github.com> Date: Wed, 27 Nov 
2024 07:57:45 -0800 Subject: [PATCH 07/27] HiFi optimizations for mean, where, min, max, pow, rem and quantized_linear operators. (#6867) * Adding mean and where ops optimized on HiFi * Adding quantized linear optimized versions for int8 and uint8 * adding pow, remainder, minimum, maximum operators (#33) * adding pow, remainder, minimum, maximum operators * adding pow, remainder, minimum, maximum operators * Fix for build issue faced in div_mod on old tools * Fix build failure due to merge issue * Fixing review comments on PR 6867 --------- Co-authored-by: dijopaul Co-authored-by: nishpoonia <94543206+nishpoonia@users.noreply.github.com> Co-authored-by: mcremon-meta <134334895+mcremon-meta@users.noreply.github.com> --- backends/cadence/aot/functions_hifi.yaml | 32 +- backends/cadence/hifi/kernels/CMakeLists.txt | 3 + backends/cadence/hifi/kernels/kernels.h | 42 + .../cadence/hifi/operators/CMakeLists.txt | 4 + .../cadence/hifi/operators/op_maximum.cpp | 175 +++ .../cadence/hifi/operators/op_minimum.cpp | 173 +++ backends/cadence/hifi/operators/op_pow.cpp | 354 +++++ backends/cadence/hifi/operators/op_rsqrt.cpp | 55 + .../hifi/operators/quantized_linear_out.cpp | 38 +- .../third-party/nnlib/xa_nn_broadcast_32.c | 313 +++++ .../third-party/nnlib/xa_nn_broadcast_32_32.c | 313 +++++ .../nnlib/xa_nn_elm_minimum_maximum_f32.c | 847 ++++++++++++ .../third-party/nnlib/xa_nn_elm_pow_f32.c | 1151 +++++++++++++++++ 13 files changed, 3478 insertions(+), 22 deletions(-) create mode 100644 backends/cadence/hifi/operators/op_maximum.cpp create mode 100644 backends/cadence/hifi/operators/op_minimum.cpp create mode 100644 backends/cadence/hifi/operators/op_pow.cpp create mode 100644 backends/cadence/hifi/operators/op_rsqrt.cpp create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index cf234c22c0..b6a2c50001 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -77,10 +77,20 @@ - arg_meta: null kernel_name: torch::executor::max_pool2d_with_indices_out +- op: maximum.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::maximum_out + - op: mean.out kernels: - arg_meta: null - kernel_name: cadence::impl::HiFi::mean_dim_out + kernel_name: cadence::impl::HiFi::mean_dim_out + +- op: minimum.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::minimum_out - op: mul.out kernels: @@ -92,6 +102,26 @@ - arg_meta: null kernel_name: torch::executor::permute_copy_out +- op: pow.Scalar_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::pow_Scalar_out + +- op: pow.Tensor_Scalar_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::pow_Tensor_Scalar_out + +- op: pow.Tensor_Tensor_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::pow_Tensor_Tensor_out + +- op: rsqrt.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::rsqrt_out + - op: sigmoid.out kernels: - arg_meta: null diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 9321cc544e..3d321443f8 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ 
b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -9,10 +9,13 @@ add_library( cadence_kernels kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c ) diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 2c915661f8..10927adc2a 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -15,6 +15,14 @@ #include "xa_nnlib_kernels_api.h" /* Potential NNLIB function/APIs */ + +extern "C" WORD32 xa_nn_broadcast_32_32( + WORD32* __restrict__ p_out, + const int* const out_shape, + WORD32* __restrict__ p_in, + const int* const in_shape, + int num_dims); + extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -47,6 +55,34 @@ extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( const WORD32* const p_inp2_shape, WORD32 mode); +extern "C" WORD32 xa_nn_elm_maximum_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + +extern "C" WORD32 xa_nn_elm_minimum_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -55,6 +91,12 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( const FLOAT32* __restrict__ p_inp2, const WORD32* const p_inp2_shape); +extern "C" void xa_nn_elm_pow_f32( + FLOAT32* restrict z, + const FLOAT32* restrict x, + const FLOAT32* restrict y, + WORD32 N); + extern "C" WORD32 xa_nn_elm_where_f32xf32_f32( FLOAT32* __restrict__ p_out, const FLOAT32* __restrict__ p_inp1, diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index fc00345465..5e51f7fd3b 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -22,8 +22,12 @@ endif() set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" 
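+ # Each op_*.cpp listed here provides the HiFi-optimized kernel named in functions_hifi.yaml; this change adds the maximum, minimum, pow, and rsqrt sources below.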
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp" diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp new file mode 100644 index 0000000000..f9a3658891 --- /dev/null +++ b/backends/cadence/hifi/operators/op_maximum.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { +namespace { + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = + torch::executor::native::utils::max_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& maximum_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + bool optimized = true; + /*find broadcast*/ + bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = 
a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = false; + if ((broadcast == true) && (max_dim > kNnlibMaxDim)) + optimized = false; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast == true) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + off_o] = out.size(i); + } + + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_maximum_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + return out; + } + ET_SWITCH_REALHB_TYPES(a_type, ctx, "maximum.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "maximum.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "maximum.out", CTYPE_OUT, [&]() { + MaximumInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_minimum.cpp b/backends/cadence/hifi/operators/op_minimum.cpp new file mode 100644 index 0000000000..6f81ad5c3e --- /dev/null +++ b/backends/cadence/hifi/operators/op_minimum.cpp @@ -0,0 +1,173 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { +namespace { + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = + torch::executor::native::utils::min_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& minimum_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + bool optimized = true; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = false; + if ((broadcast == true) && (max_dim > kNnlibMaxDim)) + optimized = false; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast == true) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + off_o] = out.size(i); + } + + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_minimum_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + return out; + } + ET_SWITCH_REALHB_TYPES(a_type, ctx, "minimum.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "minimum.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "minimum.out", CTYPE_OUT, [&]() { + MinimumInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp new file mode 100644 index 0000000000..9669e96123 --- /dev/null +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -0,0 +1,354 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::promoteTypes; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + torch::executor::apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& pow_Tensor_Tensor_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK( + ctx, common_type != exec_aten::ScalarType::Bool, InvalidArgument, out); + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + constexpr auto name = "pow.Tensor_Tensor_out"; + constexpr int kNnlibMaxDim = 16; + int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); + bool optimized = true; + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted && b_is_broadcasted); + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
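+ /* (kNnlibMaxDim is 16 here, rather than 4, because this path pre-broadcasts its inputs with xa_nn_broadcast_32_32 instead of relying on a 4D broadcast kernel) */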
out.dim() : max_dim; + + if (out_type != ScalarType::Float) + optimized = false; + + if (max_dim > kNnlibMaxDim) + optimized = false; + + WORD32 num_elm = out.numel(); + + if (optimized) { + if (broadcast) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + WORD32* __restrict__ ptr2 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)a.const_data_ptr(); + WORD32* __restrict__ pin2 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + WORD32 p_inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i] = a.size(i); + for (int i = 0; i < b_dim; i++) + p_inp2_shape[i] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + xa_nn_broadcast_32_32(ptr2, p_out_shape, pin2, p_inp2_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr2; + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + free(ptr2); + } else if (a_is_broadcasted && (!b_is_broadcasted)) { + FLOAT32* __restrict__ ptr1 = + (FLOAT32* __restrict__)malloc((num_elm + 2) * sizeof(WORD32)); + + FLOAT32* __restrict__ pin1 = + (FLOAT32* __restrict__)a.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i] = a.size(i); + + xa_nn_broadcast_32_32( + (WORD32*)ptr1, p_out_shape, (WORD32*)pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else if (b_is_broadcasted && (!a_is_broadcasted)) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < b_dim; i++) + p_inp1_shape[i] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr1; + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else { + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + } + return out; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + 
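+ // Non-optimized fallback, reached when the float fast path above bails out: compute std::pow in the promoted dtype CTYPE_IN and cast to the output dtype.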
ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + PowInner< + !std::is_same::value && + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + + return out; +} + +Tensor& pow_Tensor_Scalar_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = torch::executor::native::utils::get_scalar_dtype(b); + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + a_type, b, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + constexpr auto name = "pow.Tensor_Scalar_out"; + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + torch::executor::native::utils::extract_scalar(b, &val_b); + torch::executor::apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +Tensor& pow_Scalar_out( + KernelRuntimeContext& ctx, + const Scalar& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, b.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = torch::executor::native::utils::get_scalar_dtype(a); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + b_type, a, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + constexpr auto name = "pow.Scalar_out"; + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_SCALAR_OBJ_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_A val_a = 0; + torch::executor::native::utils::extract_scalar(a, &val_a); + + torch::executor::apply_unary_map_fn( + [val_a](const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + b.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence + diff --git a/backends/cadence/hifi/operators/op_rsqrt.cpp b/backends/cadence/hifi/operators/op_rsqrt.cpp new file mode 100644 index 0000000000..1cf717988a --- /dev/null +++ b/backends/cadence/hifi/operators/op_rsqrt.cpp @@ -0,0 +1,55 @@ +/* 
+ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { +namespace { + +double rsqrt(double x) { + return 1.0 / std::sqrt(x); +} + +} // namespace + +Tensor& rsqrt_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { + bool optimized = true; + + if (out.scalar_type() != ScalarType::Float) + optimized = false; + + if (optimized) { + WORD32 num_elm = out.numel(); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp = + (const FLOAT32* __restrict__)in.const_data_ptr(); + + xa_nn_elm_rsqrt_f32_f32(p_out, p_inp, num_elm); + return out; + } + + return torch::executor::native::internal:: + unary_ufunc_realhbbf16_to_floathbf16(rsqrt, ctx, in, out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index 0f56a1a963..accc610132 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -26,6 +26,9 @@ using ::executorch::aten::Tensor; using ::executorch::runtime::getLeadingDims; using ::executorch::runtime::KernelRuntimeContext; + + // The nnlib kernel to compute quantized linear via matmul. + void _quantized_linear_asym8u( const Tensor& in, const Tensor& weight, @@ -37,37 +40,30 @@ void _quantized_linear_asym8u( int64_t out_zero_point, __ET_UNUSED const optional& offset, Tensor& out) { - // input comes in shape [leading_dims, in_dim] - // weight comes in shape [out_dim, in_dim] - // output comes in empty with shape [leading_dims, out_dim] - // Perform matrix multiply (M x N) x (N x P)' => M x P const int64_t leading_dims = getLeadingDims(in, in.dim() - 1); const int64_t out_dim = weight.size(0); // = out_dim const int64_t in_dim = weight.size(1); // = in_dim - const uint8_t* __restrict__ in_data = in.const_data_ptr(); const uint8_t* __restrict__ weight_data = weight.const_data_ptr(); const int32_t* __restrict__ bias_data = bias.const_data_ptr(); uint8_t* __restrict__ out_data = out.mutable_data_ptr(); - - // The nnlib kernel to compute quantized linear via matmul. int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u( - out_data, // p_out - weight_data, // p_mat1, - in_data, // p_mat2, - bias_data, // p_bias - out_dim, // rows of p_mat1 - in_dim, // cols of p_mat1 - in_dim, // row_stride of p_mat1 - leading_dims, // vec_count, i.e., rows of p_mat2 - in_dim, // vec_offset of p_mat2. 
- out_dim, // out_offset, i.e., offset of next output element written - 1, // out_stride, i.e., stride to go to next output row + out_data, + weight_data, + in_data, + bias_data, + out_dim, + in_dim, + in_dim, + leading_dims, + in_dim, + out_dim, + 1, -weight_zero_point.const_data_ptr()[0], // mat1_zero_bias -in_zero_point, // mat2_zero_bias - out_multiplier.const_data_ptr()[0], // out_multiplier - out_shift.const_data_ptr()[0], // out_shift - out_zero_point); // out_zero_bias + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + out_zero_point); ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed"); } diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c new file mode 100644 index 0000000000..cad3f1a25b --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +/* + * xa_nn_broadcast_8_8.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += 
step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c new file mode 100644 index 0000000000..34a7111ee7 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +/* + * xa_nn_broadcast_32_32.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += 
step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c new file mode 100644 index 0000000000..3af93fc00c --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c @@ -0,0 +1,847 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +#include "nnlib-hifi4/xa_nnlib/include/xa_type_def.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" +#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" +#include "nnlib-hifi4/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h" + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_maximum_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_maximum_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + y = XT_MAX_SX2(x2, x1); + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_MAX_SX2(x2, x1); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_MAX_S(a1, a2); + XT_SSI(a, (xtfloat *)out, 0); + } + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_maximum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_MAX_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } 
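+ /* aligned case: all three pointers are 8-byte aligned, so the paired XT_LSX2IP/XT_SSX2IP loop above is safe; the else branch below uses ae_valign priming loads/stores for unaligned data */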
+ } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_MAX_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_MAX_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } +} + +static void internal_elm_maximum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_maximum_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_maximum_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 
1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_maximum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_maximum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_minimum_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_minimum_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + y = XT_MIN_SX2(x2, x1); + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_MIN_SX2(x2, x1); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_MIN_S(a1, a2); + XT_SSI(a, (xtfloat *)out, 0); + } + 
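+ // Vectorized pairs plus the scalar tail above cover all num_elm elements.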
return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_minimum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_MIN_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_MIN_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_MIN_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } +} + +static void internal_elm_minimum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? 
p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_minimum_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_minimum_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_minimum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ 
p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_minimum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c new file mode 100644 index 0000000000..4dcec52f97 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c @@ -0,0 +1,1151 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ("Cadence */ +/* Libraries") are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* DSP Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2015-2018 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +/* + NatureDSP Signal Processing Library. Vector mathematics + Vector operations + code optimized for HiFi4 core + IntegrIT, 2006-2018 +*/ + +#include "../include/NatureDSP_Signal_math.h" +#include "NatureDSP_types.h" +#include "xa_nn_common.h" + +/* Common helper macros. */ +#include "xa_nnlib_common_fpu.h" + +#include "xa_nnlib_common.h" +/* Constant tables. */ + +const union ufloat32uint32 ALIGN(8) xa_nnlib_pow2f_coef[] = +{ + { 0x39222a65 }, + { 0x3aaf931c }, + { 0x3c1d94fc }, + { 0x3d63578a }, + { 0x3e75fdf0 }, + { 0x3f317218 }, + { 0x3f800000 } + + //{ 0x3aaf931b }, + //{ 0x3c1e7220 }, + //{ 0x3d63578a }, + //{ 0x3e75fcc9 }, + //{ 0x3f317218 }, + //{ 0x3f800000 } + +}; + +const union ufloat32uint32 ALIGN(8) xa_nnlib_log2f_coef[] = +{ + { 0x3d726a49 }, + { 0x3dd91c88 }, + { 0x3ddde76c }, + { 0x3de21e63 }, + { 0x3dfe600b }, + { 0x3e124679 }, + { 0x3e2ab2f1 }, + { 0x3e4ccd1b }, + { 0x3e7fffde }, + { 0x3eaaaaaa }, + { 0x3f000000 }, + { 0x3f800000 }, + /* log2(e) */ + { 0x3fb8aa3b }, /* 1.4426950216 */ + { 0x32a57060 } /* 1.9259629891e-008 */ +}; + +const union ufloat32uint32 xa_nnlib_pow_plusInff ={0x7f800000}; + +const union ufloat32uint32 xa_nnlib_pow_qNaNf = { 0x7fc00000 }; + +#define MIN(a,b) ( (a)<(b) ? 
(a) : (b) ) +#define MAX(a,b) ( (a)>(b) ? (a) : (b) ) + +/*------------------------------------------------------------------------- + Power function + These routines calculate power function for 32-bit fixed-point numbers or + floating point numbers. + For the fixed point API, The base is represented in Q31, the exponent + is represented in Q6.25. Results are represented as normalized fixed point + number with separate mantissa in Q31 and exponent. + + Precision: + 32x32 32-bit inputs, 32-bit outputs + f floating point input, floating point output + + Accuracy: + 2 ULP for fixed point API + 2 ULP under condition that |y|<=100 + + Notes: +1. Scalar floating point raise to a power functions conform to ANSI C requirements on + standard math library functions in respect to treatment of errno and floating- + point exceptions. Vectorized function does not touch errno and may raise or not raise + floating point exceptions. +2. For floating point API, If x<0 is finite, y is finite and not an integer value, + then the respective result z is set to NaN +3. For fixed point API, function returns zero for all non-positive x. Fixed point + functions never touch errno + + Special cases: + x | y | Result | Extra Conditions + --------+--------+--------+--------------------- + floating point API + --------+--------+--------+--------------------- + +/-0 | y | +/-inf | odd y<0 + +/-0 | y | +inf | even y<0 + +/-0 | y | +/-0 | odd y>0 + +/-0 | y | 0 | even y>0 + +/-1 | +/-inf | 1 | + 1 | y | 1 | any y including NaN + x | +/-0 | 1 | any x including NaN + x | y | NaN | finite x<0 and finite + | | | non-integer y (see + | | | note 2) + x | -inf | +inf | |x|<1 + x | -inf | 0 | |x|>1 + x | +inf | 0 | |x|<1 + x | +inf | +inf | |x|>1 + -inf | y | -0 | y an odd integer <0 + -inf | y | 0 | y<0 and not an odd + | | | integer + -inf | y | -inf | y an odd integer >0 + -inf | y | +inf | y>0 and not an odd + | | | integer + +inf | y | 0 | y<0 + +inf | y | +inf | y>0 + --------+--------+--------+--------------------- + fixed point API + --------+--------+--------+--------------------- + x | y | 0 | x<=0 + --------+--------+--------+--------------------- + + Input: + x[N] input data,Q0.31 or floating point + y[N] input data,Q6.25 or floating point + N length of vectors + Output (fixed point API): + m[N] mantissa of output, Q31 + e[N] exponent of output + Output (floating point API): + z[N] results: floating point + + Restriction: + z,x,y,m should not overlap +-------------------------------------------------------------------------*/ + +#if !HAVE_VFPU && !HAVE_FPU +DISCARD_FUN(void, xa_nn_elm_pow_f32, (FLOAT32 * restrict z, const FLOAT32 * restrict y, const FLOAT32 * restrict x, WORD32 N)) +#elif HAVE_VFPU +#define sz_f32 (int)sizeof(FLOAT32) +static void mypowf(FLOAT32 * scr, + FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + WORD32 N ) +{ + /* Table of different constants used in computations */ + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + int n; + const xtfloatx2 * pX; + const xtfloatx2 * pY; + + const xtfloatx2 * restrict S_rd; + xtfloatx2 * restrict S_wr; + xtfloatx2 * restrict pZ; + const ae_int32 * restrict TBL; + const xtfloat * restrict TBL_LOG2; + const xtfloat * restrict TBL_POW2; + xtfloatx2 x0, y0, z0, t0, t1, ef0; + xtfloatx2 c2f, 
c3f, c4f; + xtfloatx2 _0, _1, half; + ae_int32x2 c0i, c1i, c5i, c7i, c8i; + ae_int32x2 e0, xi0, yi0, ex0; + xtbool2 bsx, bsy, bdenorm, bsmall; + ae_valign aX, aY, aZ; + + /* overall number of blocks; number of values in the current block */ + int blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ / (3*sz_f32); + + + if (N <= 0) return; + + NASSERT(N % 2 == 0); + NASSERT_ALIGN16(scr); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + + blkLen = 0; + TBL = (const ae_int32 *)c_tbl; + for (; N>0; N -= blkLen, x += blkSize, y += blkSize, z += blkSize) + { + blkLen = XT_MIN(N, blkSize); + _0 = 0.0f; + _1 = (1.0f); + half = (0.5f); + { + pX = (const xtfloatx2*)x; + S_wr = (xtfloatx2*)scr; + aX = AE_LA64_PP(pX); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LASX2IP(x0, aX, pX); + + x0 = XT_ABS_SX2(x0); + c0i = AE_L32_I(TBL, 0 * 4); /*-126*/ + c1i = AE_L32_I(TBL, 1 * 4); /*-150*/ + c2f = XT_LSI((xtfloat*)TBL, 2 * 4); + c3f = XT_LSI((xtfloat*)TBL, 3 * 4); + /* process denormalized values */ + bdenorm = XT_OLE_SX2(x0, c2f); + t0 = XT_MUL_SX2(x0, c3f); + XT_MOVT_SX2(x0, t0, bdenorm); + e0 = c0i; + AE_MOVT32X2(e0, c1i, bdenorm); + /* extract exponent */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(c2f);/* load mantissa mask */ //!!!!!!!!!!!!! + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, c5i); + x0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + c4f = XT_LSI((xtfloat*)TBL, 4 * 4); + bsmall = XT_OLT_SX2(x0, c4f); + t0 = XT_ADD_SX2(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_SX2(x0, t0, bsmall); + AE_MOVT32X2(e0, ex0, bsmall); + x0 = XT_SUB_SX2(_1, x0); //!!! + ef0 = XT_FLOAT_SX2(e0, 0); //!!! 
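+ /* At this point |x| has been split as m * 2^e with the mantissa m scaled into [sqrt(0.5), sqrt(2)): x0 holds (1 - m) and ef0 holds (float)e. The next pass evaluates log2(m) from (1 - m) with the xa_nnlib_log2f_coef polynomial and then adds e to obtain log2(|x|). */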
+ XT_SSX2IP(x0, S_wr, 2 * sz_f32); + XT_SSX2IP(ef0, S_wr, 2*2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloatx2 p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloatx2 p10, p11, p12, p13; + xtfloatx2 t2, w0, w1; + S_wr = ( xtfloatx2*)scr+2; + S_rd = (const xtfloatx2*)scr; + TBL_LOG2 = (const xtfloat *)xa_nnlib_log2f_coef; + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(x0, S_rd, 3*2 * sz_f32); + //XT_LSX2IP(ef0, S_rd, 2 * sz_f32); + + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI(TBL_LOG2, 0 * 4); + p1 = XT_LSI(TBL_LOG2, 1 * 4); + p2 = XT_LSI(TBL_LOG2, 2 * 4); + p3 = XT_LSI(TBL_LOG2, 3 * 4); + p4 = XT_LSI(TBL_LOG2, 4 * 4); + p5 = XT_LSI(TBL_LOG2, 5 * 4); + p6 = XT_LSI(TBL_LOG2, 6 * 4); + p7 = XT_LSI(TBL_LOG2, 7 * 4); + p8 = XT_LSX(TBL_LOG2, 8 * 4); + p9 = XT_LSX(TBL_LOG2, 9 * 4); + + XT_MADD_SX2(p1, x0, p0); + XT_MADD_SX2(p2, x0, p1); + XT_MADD_SX2(p3, x0, p2); + XT_MADD_SX2(p4, x0, p3); + XT_MADD_SX2(p5, x0, p4); + XT_MADD_SX2(p6, x0, p5); + XT_MADD_SX2(p7, x0, p6); + XT_MADD_SX2(p8, x0, p7); + XT_MADD_SX2(p9, x0, p8); + t2 = p9; + XT_SSX2IP(t2, S_wr, 3*2 * sz_f32); + } + S_wr = (xtfloatx2*)scr; + S_rd = (const xtfloatx2*)scr; + for (n = 0; n<(blkLen >> 1); n++) + { + p10 = XT_LSX(TBL_LOG2, 10 * 4); + p11 = XT_LSX(TBL_LOG2, 11 * 4); + p12 = XT_LSX(TBL_LOG2, 12 * 4); + p13 = XT_LSX(TBL_LOG2, 13 * 4); + + XT_LSX2IP(x0, S_rd, 2 * sz_f32); + XT_LSX2IP(ef0, S_rd, 2 * sz_f32); + XT_LSX2IP(t2, S_rd, 2 * sz_f32); + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_SX2(x0, t2); t1 = t0; + XT_MSUB_SX2(t1, x0, t2); + w0 = XT_ADD_SX2(t0, p10); + w1 = XT_SUB_SX2(w0, p10); + w1 = XT_SUB_SX2(t0, w1); + w1 = XT_SUB_SX2(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_SX2(x0, t0); w1 = w0; + XT_MSUB_SX2(w1, x0, t0); t0 = w0; + XT_MSUB_SX2(w1, x0, t1); t1 = w1; + w0 = XT_ADD_SX2(t0, p11); + w1 = XT_SUB_SX2(w0, p11); + w1 = XT_SUB_SX2(t0, w1); + w1 = XT_SUB_SX2(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_SX2(x0); + w0 = XT_MUL_SX2(x0, t0); w1 = w0; + XT_MSUB_SX2(w1, x0, t0); t0 = w0; + XT_MSUB_SX2(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_SX2(t0, p12); w1 = w0; + XT_MSUB_SX2(w1, t0, p12); + XT_MADD_SX2(w1, t1, p12); + XT_MSUB_SX2(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_SX2(t0, ef0); + w1 = XT_SUB_SX2(w0, ef0); + w1 = XT_SUB_SX2(t0, w1); + t1 = XT_SUB_SX2(w1, t1);//!!!! + t0 = w0; // !!!!! + XT_SSX2IP(t0, S_wr, 2 * sz_f32); + XT_SSX2IP(t1, S_wr, 2*2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloatx2 xy, dxy, c0, c1; + xtfloatx2 p0, p1, p2, p3, p4, p5, p6; + S_wr = ( xtfloatx2*)scr+2; + S_rd = (const xtfloatx2*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloatx2*)y; + aY = AE_LA64_PP(pY); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(t0, S_rd, 2 * sz_f32); + XT_LSX2IP(t1, S_rd, 2*2 * sz_f32); + + XT_LASX2IP(y0, aY, pY); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FIROUND_SX2(XT_MUL_SX2(y0, t0)); + dxy = XT_NEG_SX2(xy); + XT_MADD_SX2(dxy, y0, t0); + XT_MADD_SX2(dxy, y0, t1); + dxy = XT_MIN_SX2(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_SX2(dxy, (xtfloatx2)-1.0f); + /* compute 2^fract */ + p0 = XT_LSI(TBL_POW2, 0 * 4); + p1 = XT_LSI(TBL_POW2, 1 * 4); + p2 = XT_LSI(TBL_POW2, 2 * 4); + p3 = XT_LSI(TBL_POW2, 3 * 4); + p4 = XT_LSI(TBL_POW2, 4 * 4); + + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_SX2(p1, dxy, p0); + XT_MADD_SX2(p2, dxy, p1); + XT_MADD_SX2(p3, dxy, p2); + XT_MADD_SX2(p4, dxy, p3); + XT_SSX2IP(p4, S_wr, 3*2 * sz_f32); + } + __Pragma("no_reorder"); + S_wr = (xtfloatx2*)scr; + S_rd = (const xtfloatx2*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloatx2*)y; + aY = AE_LA64_PP(pY); + for (n = 0; n<(blkLen >> 1); n++) + { + + XT_LSX2IP(t0, S_rd, 2 * sz_f32); + XT_LSX2IP(t1, S_rd, 2 * sz_f32); + XT_LSX2IP(p4, S_rd, 2 * sz_f32); + p5 = XT_LSI(TBL_POW2, 5 * 4); + p6 = XT_LSI(TBL_POW2, 6 * 4); + XT_LASX2IP(y0, aY, pY); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FIROUND_SX2(XT_MUL_SX2(y0, t0)); + dxy = XT_NEG_SX2(xy); + XT_MADD_SX2(dxy, y0, t0); + XT_MADD_SX2(dxy, y0, t1); + dxy = XT_MIN_SX2(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_SX2(dxy, (xtfloatx2)-1.0f); + XT_MADD_SX2(p5, dxy, p4); + XT_MADD_SX2(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_SX2(xy, 0); + c7i = AE_L32_I(TBL, 7 * 4);/* -252 */ + c8i = AE_L32_X(TBL, 8 * 4);/* 254 */ + e0 = AE_MAX32(e0, c7i); + e0 = AE_MIN32(e0, c8i); + e0 = AE_ADD32(e0, c8i); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + c0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(e0); + c1 = XT_AE_MOVXTFLOATX2_FROMINT32X2(ex0); + z0 = XT_MUL_SX2(z0, c1); + z0 = XT_MUL_SX2(z0, c0); //!!!!!!!!!!!! + XT_SSX2IP(z0, S_wr, 2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtbool2 b_yint, b_e0, b0, b_notspec; + xtbool2 b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool2 b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloatx2 xabs, spec; + ae_int32x2 sgn, zi0; + + S_rd = (const xtfloatx2*)scr; + pY = (const xtfloatx2*)y; + pX = (const xtfloatx2*)x; + pZ = ( xtfloatx2*)z; + aY = AE_LA64_PP(pY); + aX = AE_LA64_PP(pX); + aZ = AE_ZALIGN64(); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(z0, S_rd, 2 * sz_f32); + XT_LASX2IP(x0, aX, pX); + XT_LASX2IP(y0, aY, pY); + /* Take sign of x and y */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + yi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(y0); + bsx = XT_OLT_SX2(xi0, (xtfloatx2)0.0f); + bsy = XT_OLT_SX2(yi0, (xtfloatx2)0.0f); + + xabs = XT_ABS_SX2(x0); + /* check if y is integer */ + t0 = XT_FITRUNC_SX2(y0); + b_yint = XT_OEQ_SX2(t0, y0); + + /* check if y is odd */ + e0 = XT_TRUNC_SX2(y0, 0); //temp0 + b_e0 = AE_EQ32(e0, MAX_INT32);//~b_tmp0 + b0i = AE_MOVAB2(b_e0); + b1i = AE_MOVAB2(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA2(b0i); + AE_MOVF32X2(e0, AE_ZERO32(), b0); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = XT_OEQ_SX2((xtfloatx2)0.0f, y0); /* y ==0 */ + b_yinf = XT_OEQ_SX2(XT_ABS_SX2(y0), xa_nnlib_pow_plusInff.f); /* |y|==Inf */ + b_xeqz = XT_OEQ_SX2(x0, (xtfloatx2)0.0f); /* x ==0 */ + b_xeq1 = XT_OEQ_SX2(xabs, (xtfloatx2)1.0f); /* |x|==1 */ + b_xinf = XT_OEQ_SX2(xabs, xa_nnlib_pow_plusInff.f); /* |x|==INF */ + + yint = AE_MOVAB2(b_yint); + yeqz = AE_MOVAB2(b_yeqz); + yinf = AE_MOVAB2(b_yinf); + xeqz = AE_MOVAB2(b_xeqz); + xeq1 = AE_MOVAB2(b_xeq1); + xinf = AE_MOVAB2(b_xinf); + sx = AE_MOVAB2(bsx); + sy = AE_MOVAB2(bsy); + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* 
x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA2(NaN1); + b_NaN2 = XT_UN_SX2(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA2(one); + b_Inf = AE_MOVBA2(Inf); + b_zero = AE_MOVBA2(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloatx2)xa_nnlib_pow_qNaNf.f; + XT_MOVF_SX2(spec, half, b_NaN1); + XT_MOVT_SX2(spec, _0, b_zero); + XT_MOVT_SX2(spec, xa_nnlib_pow_plusInff.f, b_Inf); + XT_MOVT_SX2(spec, xa_nnlib_pow_qNaNf.f, b_NaN2); + XT_MOVT_SX2(spec, _1, b_one); + + b_notspec = XT_OEQ_SX2(spec, half); + /* Replace result with special numbers if needed */ + XT_MOVF_SX2(z0, spec, b_notspec); + /* Restore sign and store result */ + zi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(zi0); + XT_SASX2IP(z0, aZ, pZ); + } + } + XT_SASX2POSFP(aZ, pZ); + } +} /* mypowf() */ +void xa_nn_elm_pow_f32( FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + int N ) +{ + const int blkSize = MAX_ALLOCA_SZ/sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + FLOAT32 ALIGN(16) scr[blkSize]; + int M; + if ( N<=0 ) return; + M=N&~1; + if ( M ) + { + mypowf(scr,z,x,y,M); + y += M; + x += M; + z += M; + N&=1; + } + if (N) + { // processing the tail + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + xtfloat x0, y0, t0, ef0, t1, t2; + xtfloat xy, dxy, z0, c0, c1; + xtfloat p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloat p10, p11, p12, p13, w0, w1; + xtbool bdenorm, bsmall; + ae_int32 e0, xi0, ex0; + x0=XT_LSI((const xtfloat*)x,0); + + x0 = XT_ABS_S(x0); + + /* process denormalized values */ + bdenorm = xtbool2_extract_0(XT_OLE_S(x0, XT_LSI((xtfloat*)c_tbl, 2 * 4))); + t0 = XT_MUL_S(x0, XT_LSI((xtfloat*)c_tbl, 3 * 4)); + XT_MOVT_S(x0, t0, (bdenorm)); + e0 = AE_L32_I((ae_int32 *)c_tbl, 0 * 4);; + AE_MOVT_32(e0, AE_L32_I((ae_int32 *)c_tbl, 1 * 4), (bdenorm)); + /* extract exponent */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(XT_LSI((xtfloat*)c_tbl, 2 * 4));/* load mantissa mask */ //!!!!!!!!!!!!! + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, AE_L32_I((ae_int32 *)c_tbl, 5 * 4)); + x0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + + bsmall = xtbool2_extract_0(XT_OLT_S(x0, XT_LSI((xtfloat*)c_tbl, 4 * 4))); + + + t0 = XT_ADD_S(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_S(x0, t0, bsmall); + AE_MOVT_32(e0, ex0, bsmall); + x0 = XT_SUB_S(1.0f, x0); //!!! + ef0 = XT_FLOAT_S(e0, 0); //!!! 
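+ /* Same decomposition as the vector path: x0 = 1 - m and ef0 = (float)e with |x| = m * 2^e, m in [sqrt(0.5), sqrt(2)). The log2 polynomial and 2^fract steps below now run once for this single tail element. */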
+ + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 0 * 4); + p1 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 1 * 4); + p2 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 2 * 4); + p3 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 3 * 4); + p4 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 4 * 4); + p5 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 5 * 4); + p6 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 6 * 4); + p7 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 7 * 4); + p8 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 8 * 4); + p9 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 9 * 4); + + + XT_MADD_S(p1, x0, p0); + XT_MADD_S(p2, x0, p1); + XT_MADD_S(p3, x0, p2); + XT_MADD_S(p4, x0, p3); + XT_MADD_S(p5, x0, p4); + XT_MADD_S(p6, x0, p5); + XT_MADD_S(p7, x0, p6); + XT_MADD_S(p8, x0, p7); + XT_MADD_S(p9, x0, p8); + t2 = p9; + + + p10 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 10 * 4); + p11 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 11 * 4); + p12 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 12 * 4); + p13 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 13 * 4); + + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_S(x0, t2); t1 = t0; + XT_MSUB_S(t1, x0, t2); + w0 = XT_ADD_S(t0, p10); + w1 = XT_SUB_S(w0, p10); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + w0 = XT_ADD_S(t0, p11); + w1 = XT_SUB_S(w0, p11); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_S(x0); + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_S(t0, p12); w1 = w0; + XT_MSUB_S(w1, t0, p12); + XT_MADD_S(w1, t1, p12); + XT_MSUB_S(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_S(t0, ef0); + w1 = XT_SUB_S(w0, ef0); + w1 = XT_SUB_S(t0, w1); + t1 = XT_SUB_S(w1, t1);//!!!! + t0 = w0; // !!!!! + + /* compute y*log2(x) and separate result into integer and fractional parts */ + y0 = XT_LSI((const xtfloat*)y, 0); + xy = XT_FIROUND_S(XT_MUL_S(y0, t0)); + dxy = XT_NEG_S(xy); + XT_MADD_S(dxy, y0, t0); + XT_MADD_S(dxy, y0, t1); + dxy = XT_MIN_S(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_S(dxy, (xtfloatx2)-1.0f); + /* compute 2^fract */ + p0 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 0 * 4); + p1 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 1 * 4); + p2 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 2 * 4); + p3 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 3 * 4); + p4 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 4 * 4); + p5 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 5 * 4); + p6 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 6 * 4); + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_S(p1, dxy, p0); + XT_MADD_S(p2, dxy, p1); + XT_MADD_S(p3, dxy, p2); + XT_MADD_S(p4, dxy, p3); + XT_MADD_S(p5, dxy, p4); + XT_MADD_S(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_SX2(xy, 0); + e0 = AE_MAX32(e0, AE_L32_I((ae_int32 *)c_tbl, 7 * 4)); + e0 = AE_MIN32(e0, AE_L32_X((ae_int32 *)c_tbl, 8 * 4)); + e0 = AE_ADD32(e0, AE_L32_X((ae_int32 *)c_tbl, 8 * 4)); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + c0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(e0); + c1 = XT_AE_MOVXTFLOATX2_FROMINT32X2(ex0); + z0 = XT_MUL_S(z0, c1); + z0 = XT_MUL_S(z0, c0); //!!!!!!!!!!!! + + + /* Take sign of x and y */ + { + xtbool2 bsx, bsy, b_yint, b_e0, b0, b_notspec; + + xtbool2 b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool2 b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloat xabs, spec; + ae_int32 sgn, zi0; + + x0 = XT_LSI((const xtfloat*)x, 0); + y0 = XT_LSI((const xtfloat*)y, 0); + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + bsx = (XT_OLT_S(x0, (xtfloat)0.0f)); + bsy = (XT_OLT_S(y0, (xtfloat)0.0f)); + + xabs = XT_ABS_S(x0); + /* check if y is integer */ + t0 = XT_FITRUNC_S(y0); + b_yint = (XT_OEQ_S(t0, y0)); + + /* check if y is odd */ + e0 = XT_TRUNC_S(y0, 0); //temp0 + b_e0 = (AE_EQ32(e0, MAX_INT32));//~b_tmp0 + b0i = AE_MOVAB2(b_e0); + b1i = AE_MOVAB2(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA2(b0i); + AE_MOVF_32(e0, AE_ZERO32(), xtbool2_extract_0(b0)); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = (XT_OEQ_S((xtfloatx2)0.0f, y0)); /* y ==0 */ + b_yinf = (XT_OEQ_S(XT_ABS_SX2(y0), xa_nnlib_pow_plusInff.f)); /* |y|==Inf */ + b_xeqz = (XT_OEQ_S(x0, (xtfloatx2)0.0f)); /* x ==0 */ + b_xeq1 = (XT_OEQ_S(xabs, (xtfloatx2)1.0f)); /* |x|==1 */ + b_xinf = (XT_OEQ_S(xabs, xa_nnlib_pow_plusInff.f)); /* |x|==INF */ + + yint = AE_MOVAB2 (b_yint); + yeqz = AE_MOVAB2 (b_yeqz); + yinf = AE_MOVAB2 (b_yinf); + xeqz = AE_MOVAB2 (b_xeqz); + xeq1 = AE_MOVAB2 (b_xeq1); + xinf = AE_MOVAB2 (b_xinf); + sx = AE_MOVAB2 (bsx); + sy = AE_MOVAB2 (bsy); + + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA2(NaN1); + b_NaN2 = XT_UN_SX2(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA2(one); + b_Inf = AE_MOVBA2(Inf); + b_zero = AE_MOVBA2(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloat)xa_nnlib_pow_qNaNf.f; + XT_MOVF_S(spec, 0.5f, xtbool2_extract_0(b_NaN1)); + XT_MOVT_S(spec, 0.0f, xtbool2_extract_0(b_zero)); + XT_MOVT_S(spec, xa_nnlib_pow_plusInff.f, xtbool2_extract_0(b_Inf)); + XT_MOVT_S(spec, xa_nnlib_pow_qNaNf.f, xtbool2_extract_0(b_NaN2)); + XT_MOVT_S(spec, 1.0f, xtbool2_extract_0(b_one)); + + b_notspec = XT_OEQ_S(spec, 0.5f); + /* Replace result with special numbers if needed */ + XT_MOVF_S(z0, spec, xtbool2_extract_0(b_notspec)); + /* Restore sign and store result */ + zi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(zi0); + + XT_SSI(z0,(xtfloat*)z,0); + + } + } + +} /* vec_powf() */ +#else 
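+/* Scalar (FPU-only) fallback: the same blocked three-pass algorithm as the VFPU path above (split |x| into mantissa and exponent, evaluate log2 via the polynomial tables, then reconstruct 2^(y*log2|x|) and patch the special cases), processing one element per loop iteration. */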
+#define sz_f32 (int)sizeof(FLOAT32) +void xa_nn_elm_pow_f32(FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + int N) +{ + + const int blkSizef = MAX_ALLOCA_SZ / sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + float ALIGN(16) scr[blkSizef]; + /* Table of different constants used in computations */ + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + int n; + const xtfloat * pX; + const xtfloat * pY; + + const xtfloat * restrict S_rd; + xtfloat * restrict S_wr; + xtfloat * restrict pZ; + const ae_int32 * restrict TBL; + const xtfloat * restrict TBL_LOG2; + const xtfloat * restrict TBL_POW2; + xtfloat x0, y0, z0, t0, t1, ef0; + xtfloat c2f, c3f, c4f; + xtfloat _0, _1, half; + ae_int32x2 c0i, c1i, c5i, c6i, c7i, c8i; + ae_int32 e0, xi0, yi0, ex0; + xtbool bsx, bsy, bdenorm, bsmall; + + /* overall number of blocks; number of values in the current block */ + int blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ / (3 * sz_f32); + + + if (N <= 0) return; + + NASSERT_ALIGN16(scr); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + blkLen = 0; + TBL = (const ae_int32 *)c_tbl; + for (; N>0; N -= blkLen, x += blkSize, y += blkSize, z += blkSize) + { + blkLen = XT_MIN(N, blkSize); + _0 = 0.0f; + _1 = (1.0f); + half = (0.5f); + { + pX = (const xtfloat*)x; + S_wr = ( xtfloat*)scr; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(x0, pX, sz_f32); + + x0 = XT_ABS_S(x0); + c0i = AE_L32_I(TBL, 0 * 4); /* -126 */ + c1i = AE_L32_I(TBL, 1 * 4); /* -150 */ + c2f = XT_LSI((xtfloat*)TBL, 2 * 4); + c3f = XT_LSI((xtfloat*)TBL, 3 * 4); + /* process denormalized values */ + bdenorm = XT_OLE_S(x0, c2f); + t0 = XT_MUL_S(x0, c3f); + XT_MOVT_S(x0, t0, bdenorm); + e0 = c0i; + + AE_MOVT_32(e0, c1i, bdenorm); + /* extract exponent */ + xi0 = XT_RFR(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_RFR(c2f);/* load mantissa mask */ //!!!!!!!!!!!!! + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, c5i); + x0 = XT_WFR(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + c4f = XT_LSI((xtfloat*)TBL, 4 * 4); + bsmall = XT_OLT_S(x0, c4f); + t0 = XT_ADD_S(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_S(x0, t0, bsmall); + AE_MOVT_32(e0, ex0, bsmall); + x0 = XT_SUB_S(_1, x0); //!!! + ef0 = XT_FLOAT_S(e0, 0); //!!! 
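+ /* As in the VFPU build: the scratch buffer now holds (1 - m) and (float)e for this element, ready for the log2 polynomial pass below. */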
+ XT_SSIP(x0, S_wr, sz_f32); + XT_SSIP(ef0, S_wr, 2 * sz_f32); + + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloat p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloat p10, p11, p12, p13; + xtfloat t2, w0, w1; + S_wr = ( xtfloat*)scr + 2; + S_rd = (const xtfloat*)scr; + TBL_LOG2 = (const xtfloat *)xa_nnlib_log2f_coef; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(x0, S_rd, 3*sz_f32); + + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI(TBL_LOG2, 0 * 4); + p1 = XT_LSI(TBL_LOG2, 1 * 4); + p2 = XT_LSI(TBL_LOG2, 2 * 4); + p3 = XT_LSI(TBL_LOG2, 3 * 4); + p4 = XT_LSI(TBL_LOG2, 4 * 4); + p5 = XT_LSI(TBL_LOG2, 5 * 4); + p6 = XT_LSI(TBL_LOG2, 6 * 4); + p7 = XT_LSI(TBL_LOG2, 7 * 4); + p8 = XT_LSX(TBL_LOG2, 8 * 4); + p9 = XT_LSX(TBL_LOG2, 9 * 4); + + XT_MADD_S(p1, x0, p0); + XT_MADD_S(p2, x0, p1); + XT_MADD_S(p3, x0, p2); + XT_MADD_S(p4, x0, p3); + XT_MADD_S(p5, x0, p4); + XT_MADD_S(p6, x0, p5); + XT_MADD_S(p7, x0, p6); + XT_MADD_S(p8, x0, p7); + XT_MADD_S(p9, x0, p8); + t2 = p9; + XT_SSIP(t2, S_wr, 3 * sz_f32); + } + S_wr = ( xtfloat*)scr; + S_rd = (const xtfloat*)scr; + + for (n = 0; n<(blkLen); n++) + { + p10 = XT_LSX(TBL_LOG2, 10 * 4); + p11 = XT_LSX(TBL_LOG2, 11 * 4); + p12 = XT_LSX(TBL_LOG2, 12 * 4); + p13 = XT_LSX(TBL_LOG2, 13 * 4); + + XT_LSIP(x0, S_rd, sz_f32); + XT_LSIP(ef0, S_rd, sz_f32); + XT_LSIP(t2, S_rd, sz_f32); + + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_S(x0, t2); t1 = t0; + XT_MSUB_S(t1, x0, t2); + w0 = XT_ADD_S(t0, p10); + w1 = XT_SUB_S(w0, p10); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + w0 = XT_ADD_S(t0, p11); + w1 = XT_SUB_S(w0, p11); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_S(x0); + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_S(t0, p12); w1 = w0; + XT_MSUB_S(w1, t0, p12); + XT_MADD_S(w1, t1, p12); + XT_MSUB_S(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_S(t0, ef0); + w1 = XT_SUB_S(w0, ef0); + w1 = XT_SUB_S(t0, w1); + t1 = XT_SUB_S(w1, t1);//!!!! + t0 = w0; // !!!!! + XT_SSIP(t0, S_wr, sz_f32); + XT_SSIP(t1, S_wr, sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloat xy, dxy, c0, c1, _m1;; + xtfloat p0, p1, p2, p3, p4, p5, p6; + S_wr = ( xtfloat*)scr; + S_rd = (const xtfloat*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloat*)y; + _m1 = -1.0f; + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(t0, S_rd, sz_f32); + XT_LSIP(t1, S_rd, sz_f32); + XT_LSIP(y0, pY, sz_f32); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FLOAT_S(XT_ROUND_S(XT_MUL_S(y0, t0), 0), 0); + dxy = XT_NEG_S(xy); + XT_MADD_S(dxy, y0, t0); + XT_MADD_S(dxy, y0, t1); + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + c6i = AE_L32_I(TBL, 6 * 4);/* -0.5 */ + dxy = XT_MIN_S(dxy, _1); + dxy = XT_MAX_S(dxy, _m1); + /* compute 2^fract */ + p0 = XT_LSI(TBL_POW2, 0 * 4); + p1 = XT_LSI(TBL_POW2, 1 * 4); + p2 = XT_LSI(TBL_POW2, 2 * 4); + p3 = XT_LSI(TBL_POW2, 3 * 4); + p4 = XT_LSI(TBL_POW2, 4 * 4); + p5 = XT_LSI(TBL_POW2, 5 * 4); + p6 = XT_LSI(TBL_POW2, 6 * 4); + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_S(p1, dxy, p0); + XT_MADD_S(p2, dxy, p1); + XT_MADD_S(p3, dxy, p2); + XT_MADD_S(p4, dxy, p3); + XT_MADD_S(p5, dxy, p4); + XT_MADD_S(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_S(xy, 0); + c7i = AE_L32_I(TBL, 7 * 4);/* -252 */ + c8i = AE_L32_X(TBL, 8 * 4);/* 254 */ + e0 = AE_MAX32(e0, c7i); + e0 = AE_MIN32(e0, c8i); + e0 = AE_ADD32(e0, c8i); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + + c0 = XT_WFR(e0); + c1 = XT_WFR(ex0); + z0 = XT_MUL_S(z0, c1); + z0 = XT_MUL_S(z0, c0); //!!!!!!!!!!!! + XT_SSIP(z0, S_wr, sz_f32); + + } + } + __Pragma("no_reorder"); + /* */ + { + xtbool b_yint, b_e0, b0, b_notspec; + xtbool b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloat xabs, spec; + ae_int32x2 sgn, zi0; + + S_rd = (const xtfloat*)scr; + pY = (const xtfloat*)y; + pX = (const xtfloat*)x; + pZ = (xtfloat*)z; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(z0, S_rd, sz_f32); + XT_LSIP(x0, pX, sz_f32); + XT_LSIP(y0, pY, sz_f32); + + /* Take sign of x and y */ + xi0 = XT_RFR(x0); + yi0 = XT_RFR(y0); + bsx = XT_OLT_S(x0, (xtfloat)0.0f); + bsy = XT_OLT_S(y0, (xtfloat)0.0f); + + xabs = XT_ABS_S(x0); + /* check if y is integer */ + { /* validate if y is integral - all numbers bigger than 2^23 are assumed as integral */ + xtfloat t, c; + t = XT_ABS_S((xtfloat)y0); + c = 8388608.f; + XT_MOVT_S(c, t, XT_ULT_S(t, 8388608.f)); + t = c; + t0 = XT_FLOAT_S(XT_TRUNC_S(t, 0), 0); + b_yint = XT_OEQ_S(XT_FLOAT_S(XT_TRUNC_S(t, 0), 0), t); + } + + /* check if y is odd */ + e0 = XT_TRUNC_S(y0, 0); //temp0 + b_e0 = xtbool2_extract_0(AE_EQ32(e0, MAX_INT32));//~b_tmp0 + b0i = AE_MOVAB(b_e0); + b1i = AE_MOVAB(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA(b0i); + AE_MOVF_32(e0, AE_ZERO32(), b0); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = XT_OEQ_S((xtfloat)0.0f, y0); /* y ==0 */ + b_yinf = XT_OEQ_S(XT_ABS_S(y0), xa_nnlib_pow_plusInff.f); /* |y|==Inf */ + b_xeqz = XT_OEQ_S(x0, (xtfloat)0.0f); /* x ==0 */ + b_xeq1 = XT_OEQ_S(xabs, (xtfloat)1.0f); /* |x|==1 */ + b_xinf = XT_OEQ_S(xabs, xa_nnlib_pow_plusInff.f); /* |x|==INF */ + + yint = AE_MOVAB(b_yint); + yeqz = AE_MOVAB(b_yeqz); + yinf = AE_MOVAB(b_yinf); + xeqz = AE_MOVAB(b_xeqz); + xeq1 = AE_MOVAB(b_xeq1); + xinf = AE_MOVAB(b_xinf); + sx = AE_MOVAB(bsx); + sy = AE_MOVAB(bsy); + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA(NaN1); + b_NaN2 = XT_UN_S(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA(one); + b_Inf = AE_MOVBA(Inf); + b_zero = AE_MOVBA(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloat)xa_nnlib_pow_qNaNf.f; + XT_MOVF_S(spec, half, b_NaN1); + XT_MOVT_S(spec, _0, b_zero); + XT_MOVT_S(spec, xa_nnlib_pow_plusInff.f, b_Inf); + XT_MOVT_S(spec, xa_nnlib_pow_qNaNf.f, b_NaN2); + XT_MOVT_S(spec, _1, b_one); + + b_notspec = XT_OEQ_S(spec, half); + /* Replace result with special numbers if needed */ + XT_MOVF_S(z0, spec, b_notspec); + /* 
Restore sign and store result */ + zi0 = XT_RFR(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_WFR(zi0); + XT_SSIP(z0, pZ, sz_f32); + } + } + } + +} /* vec_powf() */ +#endif From 1dab7a9ff145cd74024388963c15c888752b7958 Mon Sep 17 00:00:00 2001 From: dijopaul <87994875+dijopaul@users.noreply.github.com> Date: Wed, 27 Nov 2024 21:28:10 +0530 Subject: [PATCH 08/27] Upgrade nnlib to latest 4.2.0 (#7105) Upgrading nnlib to latest version 4.2.0 --- backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 b/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 index 6a9ea45e23..102944a6f7 160000 --- a/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 +++ b/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 @@ -1 +1 @@ -Subproject commit 6a9ea45e23ef591fe207442df33a5ebe88bbe8de +Subproject commit 102944a6f76a0de4d81adc431f3f132f517aa87f From d136206861a8d00c61475d133f4a3e9634b12bb7 Mon Sep 17 00:00:00 2001 From: ckmadhira Date: Wed, 27 Nov 2024 21:38:29 +0530 Subject: [PATCH 09/27] For broadcast, added support to process distinct input dimensions (#7107) For broadcast, added support for distinct dimensions for both the inputs. Also, added support for processing dimension size more than 5. Signed-off-by: cmadhira@cadence.com Co-authored-by: cmadhira@cadence.com --- .../cadence/fusion_g3/operators/op_add.cpp | 48 +++++++++++++------ .../cadence/fusion_g3/operators/op_mul.cpp | 48 +++++++++++++------ 2 files changed, 66 insertions(+), 30 deletions(-) diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp index 6dc710ce6e..551c6652f1 100644 --- a/backends/cadence/fusion_g3/operators/op_add.cpp +++ b/backends/cadence/fusion_g3/operators/op_add.cpp @@ -76,27 +76,45 @@ Tensor& add_out( int inp2_shape[kTensorDimensionLimit]; int out_shape[kTensorDimensionLimit]; - /* input shapes and output shapes */ - for (auto i = 0; i < a_size.size(); i++) { - inp1_shape[i] = a_size[i]; - } - - for (auto i = 0; i < b_size.size(); i++) { - inp2_shape[i] = b_size[i]; - } - - for (auto i = 0; i < out_size.size(); i++) { - out_shape[i] = out_size[i]; - } - /*find broadcast*/ const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); const bool broadcast = (a_is_broadcasted || b_is_broadcasted); int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; - if (compute_type == ScalarType::Int) { + bool optimized = 1; + + if ((a.dim() == 0) || (b.dim() == 0)) { + optimized = 0; + } + + if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) { + optimized = 0; + } + + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } + + if ((compute_type == ScalarType::Int) && (optimized)){ const int* const inp1_data = a.const_data_ptr(); const int* const inp2_data = b.const_data_ptr(); int* const out_data = out.mutable_data_ptr(); @@ -117,7 +135,7 @@ Tensor& add_out( xa_nn_elm_add_32x32_32( out_data, inp1_data, inp2_data, alpha_val, out.numel()); } - } else if (compute_type == ScalarType::Float) { + } else if ((compute_type == ScalarType::Float) && (optimized)) { const float* const inp1_data = a.const_data_ptr(); const float* const inp2_data = b.const_data_ptr(); float* const out_data = out.mutable_data_ptr(); diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp index 366982ae3f..82e84bdbe1 100644 --- a/backends/cadence/fusion_g3/operators/op_mul.cpp +++ b/backends/cadence/fusion_g3/operators/op_mul.cpp @@ -68,27 +68,45 @@ Tensor& mul_out( int inp2_shape[kTensorDimensionLimit]; int out_shape[kTensorDimensionLimit]; - /* input shapes and output shapes */ - for (auto i = 0; i < a_size.size(); i++) { - inp1_shape[i] = a_size[i]; - } - - for (auto i = 0; i < b_size.size(); i++) { - inp2_shape[i] = b_size[i]; - } - - for (auto i = 0; i < out_size.size(); i++) { - out_shape[i] = out_size[i]; - } - /*find broadcast*/ const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); const bool broadcast = (a_is_broadcasted || b_is_broadcasted); int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; - if (compute_type == ScalarType::Int) { + bool optimized = 1; + + if ((a.dim() == 0) || (b.dim() == 0)) { + optimized = 0; + } + + if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) { + optimized = 0; + } + + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } + + if ((compute_type == ScalarType::Int) && (optimized)) { const int* const inp1_data = a.const_data_ptr(); const int* const inp2_data = b.const_data_ptr(); int* const out_data = out.mutable_data_ptr(); @@ -105,7 +123,7 @@ Tensor& mul_out( } else { xa_nn_elm_mul_32x32_32(out_data, inp1_data, inp2_data, out.numel()); } - } else if (compute_type == ScalarType::Float) { + } else if ((compute_type == ScalarType::Float) && (optimized)) { const float* const inp1_data = a.const_data_ptr(); const float* const inp2_data = b.const_data_ptr(); float* const out_data = out.mutable_data_ptr(); From a2619e1dae77af9448b53da5bf9f342e3aa9cc0a Mon Sep 17 00:00:00 2001 From: David Lin Date: Wed, 27 Nov 2024 10:35:10 -0800 Subject: [PATCH 10/27] Fix lints from HUD (#7110) run lintrunner Co-authored-by: lind --- .../cadence/fusion_g3/operators/op_add.cpp | 12 +++---- .../cadence/fusion_g3/operators/op_mul.cpp | 10 +++--- .../cadence/hifi/operators/op_maximum.cpp | 1 - backends/cadence/hifi/operators/op_pow.cpp | 1 - .../hifi/operators/quantized_linear_out.cpp | 31 +++++++++---------- 5 files changed, 26 insertions(+), 29 deletions(-) diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp index 551c6652f1..9537cbacb7 100644 --- a/backends/cadence/fusion_g3/operators/op_add.cpp +++ b/backends/cadence/fusion_g3/operators/op_add.cpp @@ -95,15 +95,15 @@ Tensor& add_out( } for (int i = 0; i < max_dim; i++) { - out_shape[i] = 1; + out_shape[i] = 1; inp1_shape[i] = 1; inp2_shape[i] = 1; } - - int offset_out = max_dim - out.dim(); + + int offset_out = max_dim - out.dim(); int offset_inp1 = max_dim - a.dim(); int offset_inp2 = max_dim - b.dim(); - + for (int i = 0; i < out.dim(); i++) { out_shape[i + offset_out] = out.size(i); } @@ -111,10 +111,10 @@ Tensor& add_out( inp1_shape[i + offset_inp1] = a.size(i); } for (int i = 0; i < b.dim(); i++) { - inp2_shape[i + offset_inp2] = b.size(i); + inp2_shape[i + offset_inp2] = b.size(i); } - if ((compute_type == ScalarType::Int) && (optimized)){ + if ((compute_type == ScalarType::Int) && (optimized)) { const int* const inp1_data = a.const_data_ptr(); const int* const inp2_data = b.const_data_ptr(); int* const out_data = out.mutable_data_ptr(); diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp index 82e84bdbe1..31cd50314e 100644 --- a/backends/cadence/fusion_g3/operators/op_mul.cpp +++ b/backends/cadence/fusion_g3/operators/op_mul.cpp @@ -87,15 +87,15 @@ Tensor& mul_out( } for (int i = 0; i < max_dim; i++) { - out_shape[i] = 1; + out_shape[i] = 1; inp1_shape[i] = 1; inp2_shape[i] = 1; } - - int offset_out = max_dim - out.dim(); + + int offset_out = max_dim - out.dim(); int offset_inp1 = max_dim - a.dim(); int offset_inp2 = max_dim - b.dim(); - + for (int i = 
0; i < out.dim(); i++) { out_shape[i + offset_out] = out.size(i); } @@ -103,7 +103,7 @@ Tensor& mul_out( inp1_shape[i + offset_inp1] = a.size(i); } for (int i = 0; i < b.dim(); i++) { - inp2_shape[i + offset_inp2] = b.size(i); + inp2_shape[i + offset_inp2] = b.size(i); } if ((compute_type == ScalarType::Int) && (optimized)) { diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp index f9a3658891..f85d3470e9 100644 --- a/backends/cadence/hifi/operators/op_maximum.cpp +++ b/backends/cadence/hifi/operators/op_maximum.cpp @@ -23,7 +23,6 @@ using torch::executor::apply_binary_elementwise_fn; using torch::executor::Error; using torch::executor::resize_to_broadcast_target_size; - namespace cadence { namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp index 9669e96123..1399c24a34 100644 --- a/backends/cadence/hifi/operators/op_pow.cpp +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -351,4 +351,3 @@ Tensor& pow_Scalar_out( } // namespace HiFi } // namespace impl } // namespace cadence - diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index accc610132..b8e1d117fb 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -26,8 +26,7 @@ using ::executorch::aten::Tensor; using ::executorch::runtime::getLeadingDims; using ::executorch::runtime::KernelRuntimeContext; - - // The nnlib kernel to compute quantized linear via matmul. +// The nnlib kernel to compute quantized linear via matmul. void _quantized_linear_asym8u( const Tensor& in, @@ -48,22 +47,22 @@ void _quantized_linear_asym8u( const int32_t* __restrict__ bias_data = bias.const_data_ptr(); uint8_t* __restrict__ out_data = out.mutable_data_ptr(); int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u( - out_data, - weight_data, - in_data, - bias_data, - out_dim, - in_dim, - in_dim, - leading_dims, - in_dim, - out_dim, - 1, + out_data, + weight_data, + in_data, + bias_data, + out_dim, + in_dim, + in_dim, + leading_dims, + in_dim, + out_dim, + 1, -weight_zero_point.const_data_ptr()[0], // mat1_zero_bias -in_zero_point, // mat2_zero_bias - out_multiplier.const_data_ptr()[0], - out_shift.const_data_ptr()[0], - out_zero_point); + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + out_zero_point); ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed"); } From 8b375f25332527a4fb8385839afb01f9d9df260d Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:04:14 -0800 Subject: [PATCH 11/27] Fix pyre in arm_backend.py Differential Revision: D66475070 Pull Request resolved: https://github.com/pytorch/executorch/pull/7069 --- backends/arm/arm_backend.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 59473a9e6d..c59eedc304 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -135,7 +135,9 @@ def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder": self.quantize_io = quantize_io return self - def set_input_order(self, input_order: str = None) -> "ArmCompileSpecBuilder": + def set_input_order( + self, input_order: Optional[str] = None + ) -> "ArmCompileSpecBuilder": """ Reorder the inputs coming in. This may be required when inputs > 1. 
And while using the U55/U85 CompileSpec. From 27638c33e7e70d15eb4b495a860c43d5f37c4dae Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 27 Nov 2024 12:02:10 -0800 Subject: [PATCH 12/27] move rope related logic together (#7113) Pull Request resolved: https://github.com/pytorch/executorch/pull/6560 Right now, rope related code scatters around a few different places in `llama_transformer`. It makes it hard to make changes to rope related things. This PR moves all rope related logic into its own module. ghstack-source-id: 255543205 Differential Revision: [D65173598](https://our.internmc.facebook.com/intern/diff/D65173598/) Co-authored-by: Lunwen He --- examples/models/llama/llama_transformer.py | 139 +++++++++++------- .../llama/source_transformation/rope.py | 28 ++-- 2 files changed, 101 insertions(+), 66 deletions(-) diff --git a/examples/models/llama/llama_transformer.py b/examples/models/llama/llama_transformer.py index 3f8b8dd654..10d660d37a 100644 --- a/examples/models/llama/llama_transformer.py +++ b/examples/models/llama/llama_transformer.py @@ -147,6 +147,81 @@ def __post_init__(self): self.head_dim = self.dim // self.n_heads +class Rope(torch.nn.Module): + def __init__(self, params: ModelArgs): + super().__init__() + self.params = params + if self.params.use_hf_rope: + self.precompute_freqs_cis = hf_precompute_freqs_cis + else: + self.precompute_freqs_cis = partial( + precompute_freqs_cis, use_scaled=self.params.use_scaled_rope + ) + freqs_cos, freqs_sin = self.precompute_freqs_cis( + self.params.head_dim, + ( + self.params.max_seq_len # Normal llama2. + if self.params.ffn_dim_multiplier is None + else self.params.max_seq_len * 2 # Sharded checkpoint. + ), + self.params.rope_freq_base, + ) + self.register_buffer("freqs_cos", freqs_cos, persistent=False) + self.register_buffer("freqs_sin", freqs_sin, persistent=False) + if self.params.use_hf_rope: + self.apply_rotary_emb = hf_apply_rotary_emb + else: + self.apply_rotary_emb = RotaryEmbedding() + + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + freqs_cos: torch.Tensor, + freqs_sin: torch.Tensor, + ): + return self.apply_rotary_emb(q, k, freqs_cos, freqs_sin) + + def get_freqs(self, input_pos: Optional[torch.Tensor], seq_len: int): + """ + Get the precomputed frequencies for the given input position and sequence length. + + Args: + input_pos (torch.Tensor): The input position tensor. + seq_len (int): The sequence length. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The precomputed frequencies for the given input position and sequence length. + """ + if self.params.use_kv_cache: + assert ( + input_pos is not None + ), "input_pos must be provided when use_kv_cache is True" + + if self.params.enable_dynamic_shape: + # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos. + input_pos_item = input_pos[-1].item() + torch._check_is_size(input_pos_item) + torch._check(input_pos_item < self.params.max_seq_len) + # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor + freqs_cos = self.freqs_cos.narrow(0, input_pos_item, seq_len) + # pyre-ignore: Incompatible parameter type [6] + freqs_sin = self.freqs_sin.narrow(0, input_pos_item, seq_len) + else: + # When not using dynamic shape, use of the .item results in + # symints, due to querying the data from tensor. + # this path avoids that for mps backend, although probably mps backend + # can support dynamic shape? 
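+ # (Indexing with the position tensor, rather than calling .item() and
+ # narrow(), keeps this branch free of data-dependent symints at export.)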
+ freqs_cos = self.freqs_cos[input_pos] + freqs_sin = self.freqs_sin[input_pos] + + else: + assert input_pos is None, "input_pos is unused when use_kv_cache is False" + freqs_cos = self.freqs_cos[:seq_len] + freqs_sin = self.freqs_sin[:seq_len] + return freqs_cos, freqs_sin + + class KVCache(nn.Module): def __init__( self, @@ -266,7 +341,7 @@ def forward( class Attention(nn.Module): - def __init__(self, args: ModelArgs, layer_id: int): + def __init__(self, args: ModelArgs, layer_id: int, rope: Rope): super().__init__() self.use_kv_cache = args.use_kv_cache self.n_heads = args.n_heads @@ -287,6 +362,8 @@ def __init__(self, args: ModelArgs, layer_id: int): self.layer_id = layer_id + self.rope = rope + causal_mask = torch.tril( torch.ones( self.max_seq_len, @@ -303,7 +380,7 @@ def __init__(self, args: ModelArgs, layer_id: int): args.max_seq_len, self.n_kv_heads, self.head_dim, - not args.use_sdpa_with_kv_cache_op, # if we are using the custom op dont transpose the cache. Expect untransposed q k v + not args.use_sdpa_with_kv_cache_op, # if we are using the custom op don't transpose the cache. Expect untransposed q k v args.enable_dynamic_shape, ) self.SDPA = SDPA( @@ -314,10 +391,6 @@ def __init__(self, args: ModelArgs, layer_id: int): max_seq_len=self.max_seq_len, enable_dynamic_shape=args.enable_dynamic_shape, ) - if args.use_hf_rope: - self.apply_rotary_emb = hf_apply_rotary_emb - else: - self.apply_rotary_emb = RotaryEmbedding() def forward( self, @@ -336,7 +409,7 @@ def forward( v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) # RoPE relative positional embeddings - q, k = self.apply_rotary_emb(q, k, freqs_cos, freqs_sin) + q, k = self.rope.forward(q, k, freqs_cos, freqs_sin) if self.use_kv_cache: assert input_pos is not None @@ -424,13 +497,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class TransformerBlock(nn.Module): - def __init__(self, layer_id: int, args: ModelArgs): + def __init__(self, layer_id: int, args: ModelArgs, rope: Rope): super().__init__() self.use_kv_cache = args.use_kv_cache self.n_heads = args.n_heads self.dim = args.dim self.head_dim = args.head_dim - self.attention = Attention(args, layer_id) + self.attention = Attention(args, layer_id, rope) if args.moe: self.block_sparse_moe = MOEFeedForward(args) else: @@ -459,9 +532,10 @@ def __init__(self, params: ModelArgs): self.n_layers = params.n_layers self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) + self.rope = Rope(params) self.layers = torch.nn.ModuleList() for layer_id in range(params.n_layers): - self.layers.append(TransformerBlock(layer_id, params)) + self.layers.append(TransformerBlock(layer_id, params, self.rope)) self.norm = RMSNorm(params.dim, eps=params.norm_eps) self.output = nn.Linear(params.dim, params.vocab_size, bias=False) self.use_kv_cache = params.use_kv_cache @@ -469,23 +543,6 @@ def __init__(self, params: ModelArgs): self.max_seq_len = params.max_seq_len self.input_prune_map = params.input_prune_map self.output_prune_map = params.output_prune_map - if params.use_hf_rope: - self.precompute_freqs_cis = hf_precompute_freqs_cis - else: - self.precompute_freqs_cis = partial( - precompute_freqs_cis, use_scaled=params.use_scaled_rope - ) - freqs_cos, freqs_sin = self.precompute_freqs_cis( - params.head_dim, - ( - params.max_seq_len # Normal llama2. - if params.ffn_dim_multiplier is None - else params.max_seq_len * 2 # Sharded checkpoint. 
- ), - params.rope_freq_base, - ) - self.register_buffer("freqs_cos", freqs_cos, persistent=False) - self.register_buffer("freqs_sin", freqs_sin, persistent=False) def forward( self, @@ -502,33 +559,7 @@ def forward( if tokens is not None and h is None: h = self.tok_embeddings(tokens) seqlen = h.shape[1] - - if self.use_kv_cache: - assert ( - input_pos is not None - ), "input_pos must be provided when use_kv_cache is True" - - if self.params.enable_dynamic_shape: - # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos. - input_pos_item = input_pos[-1].item() - torch._check_is_size(input_pos_item) - torch._check(input_pos_item < self.params.max_seq_len) - # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor - freqs_cos = self.freqs_cos.narrow(0, input_pos_item, seqlen) - # pyre-ignore: Incompatible parameter type [6] - freqs_sin = self.freqs_sin.narrow(0, input_pos_item, seqlen) - else: - # When not using dynamic shape, use of the .item results in - # symints, due to querying the data from tensor. - # this path avoids that for mps backend, although probably mps backend - # can support dynamic shape? - freqs_cos = self.freqs_cos[input_pos] - freqs_sin = self.freqs_sin[input_pos] - - else: - assert input_pos is None, "input_pos is unused when use_kv_cache is False" - freqs_cos = self.freqs_cos[:seqlen] - freqs_sin = self.freqs_sin[:seqlen] + freqs_cos, freqs_sin = self.rope.get_freqs(input_pos, seqlen) for layer in self.layers: h = layer( diff --git a/examples/models/llama/source_transformation/rope.py b/examples/models/llama/source_transformation/rope.py index a2a2264b24..79fb239966 100644 --- a/examples/models/llama/source_transformation/rope.py +++ b/examples/models/llama/source_transformation/rope.py @@ -13,23 +13,27 @@ def materialze_broadcast_of_rope_freq_cis( module: torch.nn.Module, ): assert isinstance(module, Transformer) - assert module.freqs_cos.dim() == 2 - dim0 = module.freqs_cos.size(0) - dim1 = module.freqs_cos.size(1) + assert module.rope.freqs_cos.dim() == 2 + dim0 = module.rope.freqs_cos.size(0) + dim1 = module.rope.freqs_cos.size(1) module_attention = module.layers[0].attention assert ( module_attention.n_local_kv_heads == module_attention.n_local_heads ), f"For rope freqs to be materialized for broadcast, q, k, v num heads must match. For q got {module_attention.n_kv_heads} for k got {module_attention.n_local_heads} and v got {module_attention.n_local_kv_heads}" num_heads = module_attention.n_local_heads - module.freqs_cos = module.freqs_cos.view(dim0, 1, dim1) - module.freqs_cos = module.freqs_cos.expand(dim0, num_heads, dim1).contiguous() - assert module.freqs_sin.dim() == 2 - assert dim0 == module.freqs_sin.size( + module.rope.freqs_cos = module.rope.freqs_cos.view(dim0, 1, dim1) + module.rope.freqs_cos = module.rope.freqs_cos.expand( + dim0, num_heads, dim1 + ).contiguous() + assert module.rope.freqs_sin.dim() == 2 + assert dim0 == module.rope.freqs_sin.size( 0 - ), f"sin and cos freq table sizes must match. Mismatch found at dim 0: {dim0} vs {module.freqs_sin.size(0)}" - assert dim1 == module.freqs_sin.size( + ), f"sin and cos freq table sizes must match. Mismatch found at dim 0: {dim0} vs {module.rope.freqs_sin.size(0)}" + assert dim1 == module.rope.freqs_sin.size( 1 - ), f"sin and cos freq table sizes must match. 
Mismatch found at dim 1: {dim1} vs {module.freqs_sin.size(1)}" - module.freqs_sin = module.freqs_sin.view(dim0, 1, dim1) - module.freqs_sin = module.freqs_sin.expand(dim0, num_heads, dim1).contiguous() + ), f"sin and cos freq table sizes must match. Mismatch found at dim 1: {dim1} vs {module.rope.freqs_sin.size(1)}" + module.rope.freqs_sin = module.rope.freqs_sin.view(dim0, 1, dim1) + module.rope.freqs_sin = module.rope.freqs_sin.expand( + dim0, num_heads, dim1 + ).contiguous() return module From 6b738410e400b173fdda78bea352aa5eb334e751 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 27 Nov 2024 12:03:49 -0800 Subject: [PATCH 13/27] implement position encoding for shifted tokens Pull Request resolved: https://github.com/pytorch/executorch/pull/6646 In AttentionSink, it uses tokens' positions in the KVCache instead of the actual text. When tokens get shifted in KVCache, it needs to update q and k's position embedding. In the original [implementation](https://github.com/mit-han-lab/streaming-llm) of AttentionSink with Rope, it caches the original q and k in KVCache and apply position embedding during inference. This PR adds `RopeWithAttentionSink`. It assumes that q and k are already encoded with their original position. When we shift tokens, we reapply the position delta. This has two benefits: - minimize our code since our existing `llama_transformer` applies rope embedding before doing KVCache update - avoid performance regression when tokens are not shifted because we don't need to reapply position encoding in KVCache for them ghstack-source-id: 255579838 Differential Revision: [D65366440](https://our.internmc.facebook.com/intern/diff/D65366440/) --------- Co-authored-by: Lunwen He --- examples/models/llama/TARGETS | 14 ++++ examples/models/llama/rope.py | 41 +++++++++++ .../source_transformation/attention_sink.py | 62 ++++++++++++++++ .../test_attention_sink.py | 73 +++++++++++++++++++ 4 files changed, 190 insertions(+) create mode 100644 examples/models/llama/source_transformation/attention_sink.py create mode 100644 examples/models/llama/source_transformation/test_attention_sink.py diff --git a/examples/models/llama/TARGETS b/examples/models/llama/TARGETS index cf387bfab2..284520d4d5 100644 --- a/examples/models/llama/TARGETS +++ b/examples/models/llama/TARGETS @@ -93,6 +93,7 @@ runtime.python_library( "source_transformation/sdpa.py", "source_transformation/spin_quant.py", "source_transformation/vulkan_rope.py", + "source_transformation/attention_sink.py", ], _is_external_target = True, base_module = "executorch.examples.models.llama", @@ -213,3 +214,16 @@ runtime.python_test( "//executorch/examples/models/llama:llama_transformer", ], ) + +runtime.python_test( + name = "attention_sink_test", + srcs = [ + "source_transformation/test_attention_sink.py", + ], + supports_static_listing = False, + deps = [ + "fbsource//third-party/pypi/parameterized:parameterized", + "//caffe2:torch", + ":export_library", + ], +) diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py index 0383c79898..1445787f5e 100644 --- a/examples/models/llama/rope.py +++ b/examples/models/llama/rope.py @@ -92,6 +92,22 @@ def apply_rotary_emb( return xq_out.type_as(xq), xk_out.type_as(xk) +def apply_rotary_emb_to_k( + xk: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor +) -> torch.Tensor: + xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1) + + freqs_cos = reshape_for_broadcast(freqs_cos, xk_r) + freqs_sin = reshape_for_broadcast(freqs_sin, xk_r) + + xk_out_r = 
xk_r * freqs_cos - xk_i * freqs_sin + xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos + + xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3) + + return xk_out.type_as(xk) + + class RotaryEmbedding(torch.nn.Module): def __init__(self): super().__init__() @@ -160,3 +176,28 @@ def hf_apply_rotary_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed + + +def hf_apply_rotary_emb_to_k(k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the key tensors. + + Args: + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of k. Similarly, if k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `torch.Tensor` the key tensor rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + k_embed = (k * cos) + (rotate_half(k) * sin) + return k_embed diff --git a/examples/models/llama/source_transformation/attention_sink.py b/examples/models/llama/source_transformation/attention_sink.py new file mode 100644 index 0000000000..94f5b47871 --- /dev/null +++ b/examples/models/llama/source_transformation/attention_sink.py @@ -0,0 +1,62 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Components for supporting Attention Sink. See +# https://arxiv.org/abs/2309.17453 for more details about Attention Sink. + +import torch + +from executorch.examples.models.llama.llama_transformer import ModelArgs, Rope +from executorch.examples.models.llama.rope import ( + apply_rotary_emb_to_k, + hf_apply_rotary_emb_to_k, +) + + +class RopeWithAttentionSink(Rope): + """ + Rope that helps adjust position encoding when tokens are shifted in KVCache. + For AttentionSink, when tokens are shifted in KVCache, we need to use positions + in KVCache instead of positions in the actual text. + """ + + def __init__(self, params: ModelArgs): + super().__init__(params) + if self.params.use_hf_rope: + self.apply_rotary_emb_to_k = hf_apply_rotary_emb_to_k + else: + self.apply_rotary_emb_to_k = apply_rotary_emb_to_k + + def rerotate_k( + self, + k: torch.Tensor, + original_position: int, + new_position: int, + ): + """ + Rerotate k from original_position to new_position. 
This is done by rerotating + k with (new_position * theta - original_position * theta) with the following matrix: + (cos(delta), -sin(delta) + sin(delta), cos(delta)) + where delta = new_position * theta - original_position * theta + + The shape of k is (batch_size, seq_len, n_local_heads, head_dim) + + Based on https://github.com/huggingface/transformers/blame/main/src/transformers/cache_utils.py#L961 + """ + seq_len = k.shape[1] + original_freqs_cos = self.freqs_cos.narrow(0, original_position, seq_len) + original_freqs_sin = self.freqs_sin.narrow(0, original_position, seq_len) + new_freqs_cos = self.freqs_cos.narrow(0, new_position, seq_len) + new_freqs_sin = self.freqs_sin.narrow(0, new_position, seq_len) + rerotation_cos = ( + new_freqs_cos * original_freqs_cos + new_freqs_sin * original_freqs_sin + ) + rerotation_sin = ( + new_freqs_sin * original_freqs_cos - new_freqs_cos * original_freqs_sin + ) + + return self.apply_rotary_emb_to_k(k, rerotation_cos, rerotation_sin) diff --git a/examples/models/llama/source_transformation/test_attention_sink.py b/examples/models/llama/source_transformation/test_attention_sink.py new file mode 100644 index 0000000000..adb3bff3a5 --- /dev/null +++ b/examples/models/llama/source_transformation/test_attention_sink.py @@ -0,0 +1,73 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.examples.models.llama.llama_transformer import ModelArgs + +from executorch.examples.models.llama.source_transformation.attention_sink import ( + RopeWithAttentionSink, +) +from parameterized import parameterized + + +class RopeWithAttentionSinkTest(unittest.TestCase): + + def setUp(self): + torch.manual_seed(42) + self.params = ModelArgs(use_kv_cache=True, enable_dynamic_shape=True) + self.rope_with_attention_sink = RopeWithAttentionSink(params=self.params) + + @parameterized.expand( + [ + [128, 127], # Rotate left + [128, 128], # No rotation + [128, 129], # Rotate right + ] + ) + def test_rotate(self, original_position, new_position): + seq_len = 32 + + q = torch.rand( + 1, seq_len, self.params.n_heads, self.params.head_dim, dtype=torch.float32 + ) + k = torch.rand( + 1, + seq_len, + self.params.n_heads, + self.params.head_dim, + dtype=torch.float32, + ) + freqs_cos, freqs_sin = self.rope_with_attention_sink.get_freqs( + input_pos=torch.tensor([original_position], dtype=torch.int32), + seq_len=seq_len, + ) + _, pre_rotated_k = self.rope_with_attention_sink.forward( + q=q, + k=k, + freqs_cos=freqs_cos, + freqs_sin=freqs_sin, + ) + + rerotated_k = self.rope_with_attention_sink.rerotate_k( + k=pre_rotated_k, + original_position=original_position, + new_position=new_position, + ) + + freqs_cos, freqs_sin = self.rope_with_attention_sink.get_freqs( + input_pos=torch.tensor([new_position], dtype=torch.int32), + seq_len=seq_len, + ) + _, expected_k = self.rope_with_attention_sink.forward( + q=q, + k=k, + freqs_cos=freqs_cos, + freqs_sin=freqs_sin, + ) + + torch.testing.assert_close(rerotated_k, expected_k) From c726a9bf545f7721f7861aacda373775c1caa4c5 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 27 Nov 2024 12:06:49 -0800 Subject: [PATCH 14/27] Implement get_freqs for RopeWithAttentionSink This PR implements the `get_freqs` function for `RopeWithAttentionSink`. 
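
(For intuition, a worked example matching the unit tests added below: with
window_size=252 and sink_size=4 the cache covers max_seq_length=256
positions. A request at input_pos=250 with seq_len=10 and
eviction_batch_size=1 does not fit, so num_to_evict =
max(250 + 0 - 256 + 10, 1) = 4, position_shift becomes -4, and the
frequency tables are read from position 246.)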
It returns the `freqs_cos` and `freqs_sin` for the given `input_pos` and
`seq_len` after shifting tokens in the pre-computed `freqs_cos` and
`freqs_sin`.

Differential Revision: [D66525306](https://our.internmc.facebook.com/intern/diff/D66525306/)

ghstack-source-id: 255582545
Pull Request resolved: https://github.com/pytorch/executorch/pull/7100

Co-authored-by: Lunwen He
---
 .../source_transformation/attention_sink.py   | 29 ++++++++++-
 .../test_attention_sink.py                    | 51 ++++++++++++++++++-
 2 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/examples/models/llama/source_transformation/attention_sink.py b/examples/models/llama/source_transformation/attention_sink.py
index 94f5b47871..8f4fd1ebd2 100644
--- a/examples/models/llama/source_transformation/attention_sink.py
+++ b/examples/models/llama/source_transformation/attention_sink.py
@@ -7,6 +7,8 @@
 # Components for supporting Attention Sink. See
 # https://arxiv.org/abs/2309.17453 for more details about Attention Sink.

+from typing import Optional
+
 import torch

 from executorch.examples.models.llama.llama_transformer import ModelArgs, Rope
@@ -23,12 +25,37 @@ class RopeWithAttentionSink(Rope):
     in KVCache instead of positions in the actual text.
     """

-    def __init__(self, params: ModelArgs):
+    def __init__(
+        self,
+        params: ModelArgs,
+        window_size: int,
+        sink_size: int,
+        eviction_batch_size: int,
+    ):
         super().__init__(params)
         if self.params.use_hf_rope:
             self.apply_rotary_emb_to_k = hf_apply_rotary_emb_to_k
         else:
             self.apply_rotary_emb_to_k = apply_rotary_emb_to_k
+        self.max_seq_length = window_size + sink_size
+        assert self.max_seq_length == self.params.max_seq_len
+        self.eviction_batch_size = eviction_batch_size
+        self.position_shift = 0
+
+    def get_freqs(self, input_pos: Optional[torch.Tensor], seq_len: int):
+        assert input_pos is not None
+
+        input_pos_item = input_pos.item()
+        torch._check_is_size(input_pos_item)
+        if input_pos_item + self.position_shift + seq_len > self.max_seq_length:
+            # There are not enough spaces in the cache to store the new tokens.
+            # We need to evict some old tokens and shift some recent tokens.
+ num_to_evict = max( + input_pos_item + self.position_shift - self.max_seq_length + seq_len, + self.eviction_batch_size, + ) + self.position_shift -= num_to_evict # pyre-ignore [8] + return super().get_freqs(input_pos + self.position_shift, seq_len) def rerotate_k( self, diff --git a/examples/models/llama/source_transformation/test_attention_sink.py b/examples/models/llama/source_transformation/test_attention_sink.py index adb3bff3a5..8eaa992dc3 100644 --- a/examples/models/llama/source_transformation/test_attention_sink.py +++ b/examples/models/llama/source_transformation/test_attention_sink.py @@ -17,10 +17,57 @@ class RopeWithAttentionSinkTest(unittest.TestCase): + def _init_rope(self, params: ModelArgs, eviction_batch_size: int): + return RopeWithAttentionSink( + params=params, + window_size=252, + sink_size=4, + eviction_batch_size=eviction_batch_size, + ) + def setUp(self): torch.manual_seed(42) - self.params = ModelArgs(use_kv_cache=True, enable_dynamic_shape=True) - self.rope_with_attention_sink = RopeWithAttentionSink(params=self.params) + self.params = ModelArgs( + use_kv_cache=True, enable_dynamic_shape=True, max_seq_len=256 + ) + self.rope_with_attention_sink = self._init_rope( + params=self.params, eviction_batch_size=1 + ) + + @parameterized.expand( + [ + [0, 10, 1, 0], # No shift + [250, 10, 1, 246], # Some shift + [256, 10, 1, 246], # All shift + [0, 10, 30, 0], # No shift with batch eviction + [250, 10, 30, 220], # Some shift with batch eviction + [256, 10, 30, 226], # All shift with batch eviction + ] + ) + def test_get_freqs( + self, input_pos, seq_len, eviction_batch_size, expected_result_pos + ): + self.rope_with_attention_sink = self._init_rope( + params=self.params, eviction_batch_size=eviction_batch_size + ) + + freqs_cos, freqs_sin = self.rope_with_attention_sink.get_freqs( + input_pos=torch.tensor([input_pos], dtype=torch.int32), + seq_len=seq_len, + ) + + torch.testing.assert_close( + freqs_cos, + self.rope_with_attention_sink.freqs_cos.narrow( + 0, expected_result_pos, seq_len + ), + ) + torch.testing.assert_close( + freqs_sin, + self.rope_with_attention_sink.freqs_sin.narrow( + 0, expected_result_pos, seq_len + ), + ) @parameterized.expand( [ From 3a0e5273b7b858b45e6a95b174ec1c73208d206d Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:27:53 -0800 Subject: [PATCH 15/27] Fix cadence BUCK deps and pyre Differential Revision: D66553586 Pull Request resolved: https://github.com/pytorch/executorch/pull/7116 --- backends/cadence/runtime/TARGETS | 1 + backends/cadence/runtime/runtime.py | 3 --- backends/cadence/runtime/utils.py | 16 +++------------- examples/cadence/operators/test_add_op.py | 2 ++ extension/llm/export/quantizer_lib.py | 2 -- 5 files changed, 6 insertions(+), 18 deletions(-) diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS index db3fe0ad1e..95a7bdc369 100644 --- a/backends/cadence/runtime/TARGETS +++ b/backends/cadence/runtime/TARGETS @@ -18,6 +18,7 @@ python_library( "//executorch/devtools/bundled_program:config", "//executorch/devtools/bundled_program:core", "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/devtools:lib", "//executorch/exir:lib", ], ) diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/runtime/runtime.py index bf2932d9c7..0268931c40 100644 --- a/backends/cadence/runtime/runtime.py +++ b/backends/cadence/runtime/runtime.py @@ -167,9 +167,7 @@ def run( def compare( - # pyre-fixme[2]: Parameter 
annotation cannot be `Any`. outputs: Any, - # pyre-fixme[2]: Parameter annotation cannot be `Any`. ref_outputs: Any, name: str = "", eps_error: float = 1e-1, @@ -223,7 +221,6 @@ def run_and_compare( compare(outputs, ref_outputs, eps_error=eps_error, eps_warn=eps_warn) -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def to_nd_array(v: Union[bool, numbers.Number, ndarray, torch.Tensor]) -> np.ndarray: if isinstance(v, np.ndarray): return v diff --git a/backends/cadence/runtime/utils.py b/backends/cadence/runtime/utils.py index b3ed622e8b..0a85b6dd61 100644 --- a/backends/cadence/runtime/utils.py +++ b/backends/cadence/runtime/utils.py @@ -13,12 +13,11 @@ import torch -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. -def distance(fn: Callable[[np.ndarray, np.ndarray], float]) -> Callable[ +def distance( + fn: Callable[[np.ndarray, np.ndarray], float], +) -> Callable[ [ - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. typing.Union[np.ndarray, torch._tensor.Tensor], - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. typing.Union[np.ndarray, torch._tensor.Tensor], ], float, @@ -27,9 +26,7 @@ def distance(fn: Callable[[np.ndarray, np.ndarray], float]) -> Callable[ # the distance between two N-D tensors given a function. This can be a RMS # function, maximum abs diff, or any kind of distance function. def wrapper( - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. a: Union[np.ndarray, torch.Tensor], - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. b: Union[np.ndarray, torch.Tensor], ) -> float: # convert a and b to np.ndarray type fp64 @@ -68,24 +65,20 @@ def wrapper( @distance -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def rms(a: np.ndarray, b: np.ndarray) -> float: return ((a - b) ** 2).mean() ** 0.5 @distance -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def max_abs_diff(a: np.ndarray, b: np.ndarray) -> float: return np.abs(a - b).max() @distance -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def max_rel_diff(x: np.ndarray, x_ref: np.ndarray) -> float: return np.abs((x - x_ref) / x_ref).max() -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def to_np_arr_fp64(x: Union[np.ndarray, torch.Tensor]) -> np.ndarray: if isinstance(x, torch.Tensor): x = x.detach().cpu().numpy() @@ -94,11 +87,8 @@ def to_np_arr_fp64(x: Union[np.ndarray, torch.Tensor]) -> np.ndarray: return x -# pyre-fixme[3]: Return type must be annotated. def normalized_rms( - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. predicted: Union[np.ndarray, torch.Tensor], - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. ground_truth: Union[np.ndarray, torch.Tensor], ): num = rms(predicted, ground_truth) diff --git a/examples/cadence/operators/test_add_op.py b/examples/cadence/operators/test_add_op.py index 5481540b4f..7799fe624b 100644 --- a/examples/cadence/operators/test_add_op.py +++ b/examples/cadence/operators/test_add_op.py @@ -13,6 +13,7 @@ class ATenOpTestCases(unittest.TestCase): + # pyre-fixme[16]: Module `parameterized.parameterized` has no attribute `expand`. @parameterized.expand( [ [(7, 5, 6), (7, 5, 6)], @@ -61,6 +62,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): model, (X, Y), file_name=self._testMethodName, run_and_compare=False ) + # pyre-fixme[16]: Module `parameterized.parameterized` has no attribute `expand`. 
    @parameterized.expand(
        [
            [(7, 5, 6), (7, 5, 6)],
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index ba281864a9..3a9eebd2c3 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -184,14 +184,12 @@ def get_qnn_quantizer(
         )
         qnn_quantizer.set_per_channel_conv_quant(enable=False)
         qnn_quantizer.set_per_channel_linear_quant(enable=False)
-        # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
         qnn_quantizer.set_quant_config(
             quant_dtype, is_qat=is_qat, act_observer=MinMaxObserver
         )
     elif quant_config == "16a4w":
         # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
         quant_dtype = QuantDtype.use_16a4w
-        # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
         qnn_quantizer.set_quant_config(
             quant_dtype, is_qat=is_qat, act_observer=MinMaxObserver
         )

From 651af12b19e9aa109ce974a0d95a1710b397d6c4 Mon Sep 17 00:00:00 2001
From: lg-zhang
Date: Wed, 27 Nov 2024 16:59:46 -0800
Subject: [PATCH 16/27] use ovrsource libtorch in executorch

Differential Revision: D66526578

Pull Request resolved: https://github.com/pytorch/executorch/pull/7101
---
 extension/pytree/aten_util/targets.bzl | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/extension/pytree/aten_util/targets.bzl b/extension/pytree/aten_util/targets.bzl
index e179308020..5ba7e90596 100644
--- a/extension/pytree/aten_util/targets.bzl
+++ b/extension/pytree/aten_util/targets.bzl
@@ -20,13 +20,7 @@ def define_common_targets():
             "//executorch/runtime/platform:platform",
         ],
         compiler_flags = ["-Wno-missing-prototypes"],
-        fbcode_deps = [
-            "//caffe2:ATen-core",
-            "//caffe2:ATen-cpu",
-            "//caffe2/c10:c10",
-        ],
-        xplat_deps = [
-            "//xplat/caffe2:torch_mobile_core",
-            "//xplat/caffe2/c10:c10",
+        external_deps = [
+            "torch-core-cpp",
         ],
     )

From d243ffecf790295be0716c28b019b50d7fa13147 Mon Sep 17 00:00:00 2001
From: Hannes Friederich
Date: Thu, 28 Nov 2024 10:13:49 +0100
Subject: [PATCH 17/27] Back out "use ovrsource libtorch in executorch"

Differential Revision: D66570005

Pull Request resolved: https://github.com/pytorch/executorch/pull/7122
---
 extension/pytree/aten_util/targets.bzl | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/extension/pytree/aten_util/targets.bzl b/extension/pytree/aten_util/targets.bzl
index 5ba7e90596..e179308020 100644
--- a/extension/pytree/aten_util/targets.bzl
+++ b/extension/pytree/aten_util/targets.bzl
@@ -20,7 +20,13 @@ def define_common_targets():
             "//executorch/runtime/platform:platform",
         ],
         compiler_flags = ["-Wno-missing-prototypes"],
-        external_deps = [
-            "torch-core-cpp",
+        fbcode_deps = [
+            "//caffe2:ATen-core",
+            "//caffe2:ATen-cpu",
+            "//caffe2/c10:c10",
+        ],
+        xplat_deps = [
+            "//xplat/caffe2:torch_mobile_core",
+            "//xplat/caffe2/c10:c10",
         ],
     )

From 7c934db080b9ecd97190d76f7fc815dfa53e7f5a Mon Sep 17 00:00:00 2001
From: Zingo Andersen
Date: Tue, 19 Nov 2024 09:23:09 +0100
Subject: [PATCH 18/27] Arm Backend: Update Ethos-U compiler Vela to 4.1.0

This fixes a code generation problem. Some Ethos-U85 tests start working and
some Ethos-U55 tests are disabled due to stricter testing added to the Vela
compiler.
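
As a quick sanity check after re-running examples/arm/setup.sh, the bump can
be confirmed from the shell. Hypothetical session (assumes the vela binary is
on PATH; the exact output format may vary between releases):

    $ vela --version
    4.1.0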
Signed-off-by: Zingo Andersen Change-Id: I53a10a1675cea34e105e04f864dfa3cb4cc626fa --- backends/arm/test/ops/test_bmm.py | 19 +++++--- backends/arm/test/ops/test_conv_combos.py | 2 - backends/arm/test/ops/test_depthwise_conv.py | 25 ++++++++--- backends/arm/test/ops/test_div.py | 46 +++++++++++++++----- backends/arm/test/ops/test_layer_norm.py | 7 ++- backends/arm/test/ops/test_logsoftmax.py | 46 ++++++++++++++++---- backends/arm/test/ops/test_mean_dim.py | 4 +- backends/arm/test/ops/test_mul.py | 7 +-- backends/arm/test/ops/test_softmax.py | 29 +++++++++++- backends/arm/test/ops/test_sum.py | 24 +++++++++- backends/arm/test/ops/test_var.py | 32 ++++++++++++-- examples/arm/aot_arm_compiler.py | 4 +- examples/arm/setup.sh | 11 +++-- 13 files changed, 201 insertions(+), 55 deletions(-) diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 824ec46372..2cf90b2119 100644 --- a/backends/arm/test/ops/test_bmm.py +++ b/backends/arm/test/ops/test_bmm.py @@ -22,8 +22,8 @@ class TestBMM(unittest.TestCase): class BMM(torch.nn.Module): test_parameters = [ - (torch.rand(5, 3, 5), torch.rand(5, 5, 2)), (torch.rand(2, 1, 1), torch.rand(2, 1, 1)), + (torch.rand(5, 3, 5), torch.rand(5, 5, 2)), (torch.ones(1, 55, 3), torch.ones(1, 3, 44)), (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)), (-10 * torch.randn(2, 32, 64), 5 + 5 * torch.randn(2, 64, 32)), @@ -147,32 +147,37 @@ def test_bmm_single_input_tosa_BI(self, operand1: torch.Tensor): @parameterized.expand(BMM.test_parameters) @unittest.expectedFailure - def test_bmm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + def test_bmm_u55_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) self._test_bmm_ethosu_BI_pipeline( self.BMM(), common.get_u55_compile_spec(), test_data ) - @parameterized.expand(BMM.test_parameters) - @common.expectedFailureOnFVP + @parameterized.expand(BMM.test_parameters[:1]) def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) self._test_bmm_ethosu_BI_pipeline( self.BMM(), common.get_u85_compile_spec(), test_data ) + @parameterized.expand(BMM.test_parameters[1:]) + @unittest.expectedFailure + def test_bmm_u85_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_ethosu_BI_pipeline( + self.BMM(), common.get_u85_compile_spec(), test_data + ) + # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy @parameterized.expand(BMMSingleInput.test_parameters) @unittest.expectedFailure - def test_bmm_single_input_u55_BI(self, operand1: torch.Tensor): + def test_bmm_single_input_u55_BI_xfails(self, operand1: torch.Tensor): test_data = (operand1,) self._test_bmm_ethosu_BI_pipeline( self.BMMSingleInput(), common.get_u55_compile_spec(), test_data ) - # Numerical issues on FVP, MLETORCH 534 @parameterized.expand(BMMSingleInput.test_parameters) - @common.expectedFailureOnFVP def test_bmm_single_input_u85_BI(self, operand1: torch.Tensor): test_data = (operand1,) self._test_bmm_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 7555fff720..001c4a2bd5 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -275,8 +275,6 @@ def test_conv_meandim_u55_BI(self): model.get_inputs(), ) - # Numerical Issues on FVP, MLETORCH-520 - @common.expectedFailureOnFVP def 
test_conv_meandim_u85_BI(self): model = ComboConv2dMeandim() self._test_conv_combo_ethos_BI_pipeline( diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 28cb9ac844..628b25c259 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -156,6 +156,19 @@ ("two_dw_conv2d", two_dw_conv2d), ] +testsuite_conv2d_u85 = [ + ("2x2_1x6x4x4_gp6_st1", dw_conv2d_2x2_1x6x4x4_gp6_st1), + ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1), + ("3x3_1x4x256x256_gp4_st1", dw_conv2d_3x3_1x4x256x256_gp4_st1), + ("3x3_1x4x256x256_gp4_nobias", dw_conv2d_3x3_1x4x256x256_gp4_nobias), +] + +testsuite_conv2d_u85_xfails = [ + ("3x3_2x8x198x198_gp8_st3", dw_conv2d_3x3_2x8x198x198_gp8_st3), + ("two_dw_conv2d", two_dw_conv2d), +] + + testsuite_conv1d = [ ("2_1x6x4_gp6_st1", dw_conv1d_2_1x6x4_gp6_st1), ("two_dw_conv1d", two_dw_conv1d), @@ -274,10 +287,8 @@ def test_dw_conv1d_u55_BI( model.get_inputs(), ) - # All test cases except 3x3_1x3x256x256_gp3_st1 have numerical issues on FVP. MLETORCH-520 - @parameterized.expand(testsuite_conv1d[:-2] + testsuite_conv2d) - @common.expectedFailureOnFVP - def test_dw_conv_u85_BI_xfails( + @parameterized.expand(testsuite_conv1d + testsuite_conv2d_u85) + def test_dw_conv_u85_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): self._test_dw_conv_ethos_BI_pipeline( @@ -288,8 +299,10 @@ def test_dw_conv_u85_BI_xfails( model.get_inputs(), ) - @parameterized.expand(testsuite_conv1d[-2:]) - def test_dw_conv_u85_BI( + # All test cases except 3x3_1x3x256x256_gp3_st1 have numerical issues on FVP. MLETORCH-520 + @parameterized.expand(testsuite_conv2d_u85_xfails) + @common.expectedFailureOnFVP + def test_dw_conv_u85_BI_xfails( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): self._test_dw_conv_ethos_BI_pipeline( diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index b3815f3e7c..27febd714e 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -26,18 +26,18 @@ torch.ones(5), None, ), - ( - "op_div_rank1_rand", - torch.rand(5) * 5, - torch.rand(5) * 5, - None, - ), ( "op_div_rank1_negative_ones", torch.ones(5) * (-1), torch.ones(5) * (-1), None, ), + ( + "op_div_rank1_rand", + torch.rand(5) * 5, + torch.rand(5) * 5, + None, + ), ( "op_div_rank4_ones", torch.ones(5, 10, 25, 20), @@ -183,9 +183,7 @@ def test_div_tosa_BI( test_data = (input_, other_) self._test_div_tosa_BI_pipeline(self.Div(), test_data) - # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite) - @common.expectedFailureOnFVP + @parameterized.expand(test_data_suite[:2]) def test_div_u55_BI( self, test_name: str, @@ -199,8 +197,21 @@ def test_div_u55_BI( ) # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite) + @parameterized.expand(test_data_suite[2:]) @common.expectedFailureOnFVP + def test_div_u55_BI_xfails( + self, + test_name: str, + input_: Union[torch.Tensor, torch.types.Number], + other_: Union[torch.Tensor, torch.types.Number], + rounding_mode: Optional[str] = None, + ): + test_data = (input_, other_) + self._test_div_ethos_BI_pipeline( + self.Div(), common.get_u55_compile_spec(), test_data + ) + + @parameterized.expand(test_data_suite[:2]) def test_div_u85_BI( self, test_name: str, @@ -212,3 +223,18 @@ def test_div_u85_BI( self._test_div_ethos_BI_pipeline( self.Div(), 
common.get_u85_compile_spec(), test_data ) + + # Numerical issues on FVP likely due to mul op, MLETORCH-521 + @parameterized.expand(test_data_suite[2:]) + @common.expectedFailureOnFVP + def test_div_u85_BI_xfails( + self, + test_name: str, + input_: Union[torch.Tensor, torch.types.Number], + other_: Union[torch.Tensor, torch.types.Number], + rounding_mode: Optional[str] = None, + ): + test_data = (input_, other_) + self._test_div_ethos_BI_pipeline( + self.Div(), common.get_u85_compile_spec(), test_data + ) diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py index 0b06044a59..7375a25383 100644 --- a/backends/arm/test/ops/test_layer_norm.py +++ b/backends/arm/test/ops/test_layer_norm.py @@ -170,9 +170,8 @@ def test_layer_norm_u55_BI( ) # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite[:-1]) - @common.expectedFailureOnFVP - def test_layer_norm_u85_BI_fvp_xfails( + @parameterized.expand(test_data_suite[:-2]) + def test_layer_norm_u85_BI_fvp( self, test_name: str, test_data: torch.Tensor, @@ -182,7 +181,7 @@ def test_layer_norm_u85_BI_fvp_xfails( self.LayerNorm(*model_params), common.get_u85_compile_spec(), (test_data,) ) - @parameterized.expand(test_data_suite[-1:]) + @parameterized.expand(test_data_suite[-2:]) @unittest.skip # Flaky def test_layer_norm_u85_BI( self, diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index 5d84fa127f..910384e0a0 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -17,14 +17,29 @@ test_data_suite = [ # (test_name, test_data, dim) - ("zeros", torch.zeros(10, 10, 10, 10), 0), - ("zeros_neg_dim", torch.zeros(10, 10, 10, 10), -4), + ("zeros", torch.zeros(10, 8, 5, 2), 0), + ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), ("ones", torch.ones(10, 10), 1), - ("rand_neg_dim", torch.rand(10, 10, 10), -1), - ("rand", torch.rand(10, 10, 10, 10), 2), - ("rand_neg_dim", torch.rand(10, 10, 2, 3), -2), - ("randn", torch.randn(10, 10, 5, 10), 3), - ("randn_neg_dim", torch.randn(1, 10, 10, 10), -3), + ("ones_neg_dim", torch.ones(10, 3, 4), -1), + ("rand", torch.rand(1, 2, 5, 8), 2), + ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2), + ("randn", torch.randn(10, 10, 10, 10), 3), + ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), +] +test_data_suite_u55 = [ + # (test_name, test_data, dim) + ("ones", torch.ones(10, 10), 1), + ("ones_neg_dim", torch.ones(10, 3, 4), -1), + ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), +] + +test_data_suite_u55_xfails = [ + # (test_name, test_data, dim) + ("zeros", torch.zeros(10, 8, 5, 2), 0), + ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), + ("rand", torch.rand(1, 2, 5, 8), 2), + ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2), + ("randn", torch.randn(10, 10, 10, 10), 3), ] @@ -135,7 +150,7 @@ def test_logsoftmax_tosa_BI( ): self._test_logsoftmax_tosa_BI_pipeline(self.LogSoftmax(dim=dim), (test_data,)) - @parameterized.expand(test_data_suite) + @parameterized.expand(test_data_suite_u55) def test_logsoftmax_tosa_u55_BI( self, test_name: str, @@ -146,6 +161,19 @@ def test_logsoftmax_tosa_u55_BI( self.LogSoftmax(dim=dim), (test_data,) ) + # Expected to fail as this is not supported on u55. 
+ @parameterized.expand(test_data_suite_u55_xfails) + @unittest.expectedFailure + def test_logsoftmax_tosa_u55_BI_xfails( + self, + test_name: str, + test_data: torch.Tensor, + dim: int, + ): + self._test_logsoftmax_tosa_u55_BI_pipeline( + self.LogSoftmax(dim=dim), (test_data,) + ) + @parameterized.expand(test_data_suite) def test_logsoftmax_tosa_u85_BI( self, @@ -153,6 +181,6 @@ def test_logsoftmax_tosa_u85_BI( test_data: torch.Tensor, dim: int, ): - self._test_logsoftmax_tosa_u55_BI_pipeline( + self._test_logsoftmax_tosa_u85_BI_pipeline( self.LogSoftmax(dim=dim), (test_data,) ) diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index e8320cf1df..3cb8c5f815 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -269,8 +269,10 @@ def test_meandim_tosa_BI( ): self._test_meandim_tosa_BI_pipeline(self.MeanDim(dim, keepdim), (test_data,)) + # Expected to fail as this is not supported on u55. @parameterized.expand(MeanDim.test_data_suite) - def test_meandim_tosa_u55_BI( + @unittest.expectedFailure + def test_meandim_tosa_u55_BI_xfails( self, test_name: str, test_data: torch.Tensor, diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index 8f0321ea5f..6d6922628e 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -152,9 +152,7 @@ def test_mul_tosa_BI( test_data = (input_, other_) self._test_mul_tosa_BI_pipeline(self.Mul(), test_data) - # Numerical issues on FVP, MLETORCH-521 @parameterized.expand(test_data_sute) - @common.expectedFailureOnFVP def test_mul_u55_BI( self, test_name: str, @@ -166,10 +164,7 @@ def test_mul_u55_BI( common.get_u55_compile_spec(), self.Mul(), test_data ) - # Numerical issues on FVP, MLETORCH-521 - # test_data_sute[0] works on U85 - @parameterized.expand(test_data_sute[1:]) - @common.expectedFailureOnFVP + @parameterized.expand(test_data_sute) def test_mul_u85_BI( self, test_name: str, diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index f883d6b8de..30215b47f3 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -28,6 +28,22 @@ ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), ] +test_data_suite_u55 = [ + # (test_name, test_data, dim) + ("ones", torch.ones(10, 10), 1), + ("ones_neg_dim", torch.ones(10, 3, 4), -1), + ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), +] + +test_data_suite_u55_xfails = [ + # (test_name, test_data, dim) + ("zeros", torch.zeros(10, 8, 5, 2), 0), + ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), + ("rand", torch.rand(1, 2, 5, 8), 2), + ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2), + ("randn", torch.randn(10, 10, 10, 10), 3), +] + class TestSoftmax(unittest.TestCase): """Tests softmax.""" @@ -136,7 +152,7 @@ def test_softmax_tosa_BI( ): self._test_softmax_tosa_BI_pipeline(self.Softmax(dim=dim), (test_data,)) - @parameterized.expand(test_data_suite) + @parameterized.expand(test_data_suite_u55) def test_softmax_tosa_u55_BI( self, test_name: str, @@ -145,6 +161,17 @@ def test_softmax_tosa_u55_BI( ): self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) + # Expected to fail as this is not supported on u55. 
+ @parameterized.expand(test_data_suite_u55_xfails) + @unittest.expectedFailure + def test_softmax_tosa_u55_BI_xfails( + self, + test_name: str, + test_data: torch.Tensor, + dim: int, + ): + self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) + @parameterized.expand(test_data_suite) def test_softmax_tosa_u85_BI( self, diff --git a/backends/arm/test/ops/test_sum.py b/backends/arm/test/ops/test_sum.py index 9cd63b0a22..111517afbb 100644 --- a/backends/arm/test/ops/test_sum.py +++ b/backends/arm/test/ops/test_sum.py @@ -35,6 +35,18 @@ class Sum(torch.nn.Module): ((torch.rand(1, 2, 8, 8), [2, 3, 0], True),), ] + test_parameters_u55: list[Tuple[exampledata_t]] = [ + ((torch.rand(10), 0, True),), + ((torch.rand(10, 10), 1, False),), + ((torch.rand(1, 2, 3, 4), 3, True),), + ] + + test_parameters_u55_xfails: list[Tuple[exampledata_t]] = [ + ((torch.rand(10, 10, 10), [-3, 1], True),), + ((torch.rand(2, 1, 5, 8), 1, False),), + ((torch.rand(1, 2, 8, 8), [2, 3, 0], True),), + ] + def forward(self, x: torch.Tensor, dim: int, keepdim: bool): return x.sum(dim=dim, keepdim=keepdim) @@ -112,7 +124,7 @@ def test_sum_tosa_MI(self, test_data: tuple[exampledata_t]): def test_sum_tosa_BI(self, test_data: tuple[exampledata_t]): self._test_sum_tosa_BI_pipeline(self.Sum(), test_data) - @parameterized.expand(Sum.test_parameters) + @parameterized.expand(Sum.test_parameters_u55) def test_sum_u55_BI(self, test_data: tuple[exampledata_t]): self._test_sum_ethosu_BI_pipeline( self.Sum(), @@ -120,6 +132,16 @@ def test_sum_u55_BI(self, test_data: tuple[exampledata_t]): common.get_u55_compile_spec(permute_memory_to_nhwc=False), ) + # Expected to fail as this is not supported on u55. + @parameterized.expand(Sum.test_parameters_u55_xfails) + @unittest.expectedFailure + def test_sum_u55_BI_xfails(self, test_data: tuple[exampledata_t]): + self._test_sum_ethosu_BI_pipeline( + self.Sum(), + test_data, + common.get_u55_compile_spec(permute_memory_to_nhwc=False), + ) + @parameterized.expand(Sum.test_parameters) def test_sum_u85_BI(self, test_data: tuple[exampledata_t]): self._test_sum_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py index 3a1285e6da..06671848cc 100644 --- a/backends/arm/test/ops/test_var.py +++ b/backends/arm/test/ops/test_var.py @@ -50,6 +50,16 @@ class VarDim(torch.nn.Module): (torch.rand(1, 50, 10, 20), -1, True, True), ] + test_parameters_u55 = [ + (torch.randn(1, 50, 10, 20), 1, True, False), + (torch.randn(1, 30, 15, 20), -3, True, True), + ] + + test_parameters_u55_xfails = [ + (torch.rand(1, 50, 10), -2, True, False), + (torch.rand(1, 50, 10, 20), -1, True, True), + ] + def forward( self, x: torch.Tensor, @@ -148,8 +158,10 @@ def test_var_tosa_MI(self, test_tensor: torch.Tensor, keepdim, correction): def test_var_tosa_BI(self, test_tensor: torch.Tensor, keepdim, correction): self._test_var_tosa_BI_pipeline(self.Var(), (test_tensor, keepdim, correction)) + # Expected to fail as this is not supported on u55. 
@parameterized.expand(Var.test_parameters) - def test_var_u55_BI(self, test_tensor: torch.Tensor, keepdim, correction): + @unittest.expectedFailure + def test_var_u55_BI_xfails(self, test_tensor: torch.Tensor, keepdim, correction): self._test_var_ethosu_BI_pipeline( self.Var(), common.get_u55_compile_spec(), @@ -176,7 +188,7 @@ def test_var_dim_tosa_BI(self, test_tensor: torch.Tensor, dim, keepdim, correcti self.VarDim(), (test_tensor, dim, keepdim, correction) ) - @parameterized.expand(VarDim.test_parameters) + @parameterized.expand(VarDim.test_parameters_u55) def test_var_dim_u55_BI(self, test_tensor: torch.Tensor, dim, keepdim, correction): self._test_var_ethosu_BI_pipeline( self.VarDim(), @@ -184,6 +196,18 @@ def test_var_dim_u55_BI(self, test_tensor: torch.Tensor, dim, keepdim, correctio (test_tensor, dim, keepdim, correction), ) + # Expected to fail as this is not supported on u55. + @parameterized.expand(VarDim.test_parameters_u55_xfails) + @unittest.expectedFailure + def test_var_dim_u55_BI_xfails( + self, test_tensor: torch.Tensor, dim, keepdim, correction + ): + self._test_var_ethosu_BI_pipeline( + self.VarDim(), + common.get_u55_compile_spec(), + (test_tensor, dim, keepdim, correction), + ) + @parameterized.expand(VarDim.test_parameters) def test_var_dim_u85_BI(self, test_tensor: torch.Tensor, dim, keepdim, correction): self._test_var_ethosu_BI_pipeline( @@ -208,8 +232,10 @@ def test_var_correction_tosa_BI( self.VarCorrection(), (test_tensor, dim, keepdim, correction) ) + # Expected to fail as this is not supported on u55. @parameterized.expand(VarCorrection.test_parameters) - def test_var_correction_u55_BI( + @unittest.expectedFailure + def test_var_correction_u55_BI_xfails( self, test_tensor: torch.Tensor, dim, keepdim, correction ): self._test_var_ethosu_BI_pipeline( diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index ddd5fd6b0b..a16d947dd6 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -263,7 +263,7 @@ def get_compile_spec( target, system_config="Ethos_U55_High_End_Embedded", memory_mode="Shared_Sram", - extra_flags="--debug-force-regor --output-format=raw", + extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate", ) .set_permute_memory_format(True) .set_quantize_io(True) @@ -276,7 +276,7 @@ def get_compile_spec( target, system_config="Ethos_U85_SYS_DRAM_Mid", memory_mode="Shared_Sram", - extra_flags="--output-format=raw", + extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate", ) .set_permute_memory_format(True) .set_quantize_io(True) diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 583237729d..84f2371466 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -89,7 +89,11 @@ ethos_u_base_rev="24.08" # tosa reference model tosa_reference_model_url="https://review.mlplatform.org/tosa/reference_model" tosa_reference_model_rev="f9ea4ab7da19318fe36b1c34d68a3e40fd6e56c5" - + +# vela +vela_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u-vela" +vela_rev="a08fc18780827b5fefc814dd0162ee6317ce0ae7" + ######## ### Mandatory user args ######## @@ -198,6 +202,7 @@ function setup_ethos_u() { cd ethos-u git reset --hard ${ethos_u_base_rev} python3 ./fetch_externals.py -c ${ethos_u_base_rev}.json fetch + pip install pyelftools echo "[${FUNCNAME[0]}] Done @ $(git describe --all --long 3> /dev/null) in ${root_dir}/ethos-u dir." } @@ -259,9 +264,9 @@ function setup_vela() { # cd "${root_dir}" if [[ ! 
-e ethos-u-vela ]]; then
-        git clone https://review.mlplatform.org/ml/ethos-u/ethos-u-vela
+        git clone ${vela_repo_url}
         repo_dir="${root_dir}/ethos-u-vela"
-        base_rev=57ce18c89ccc6f6309333dccb24ed30dc68b571f
+        base_rev=${vela_rev}
         patch_repo
     fi
     cd "${root_dir}/ethos-u-vela"

From e83ab0e06a8f1b4153df8f136f3a8ad0455dbd7c Mon Sep 17 00:00:00 2001
From: Zingo Andersen
Date: Tue, 19 Nov 2024 09:23:09 +0100
Subject: [PATCH 19/27] Arm Backend: Update Ethos-U compiler Vela to 4.1.0

This fixes a code generation problem. Some Ethos-U85 tests start working and
some Ethos-U55 tests are disabled due to stricter testing added to the Vela
compiler.

Signed-off-by: Zingo Andersen
Change-Id: I53a10a1675cea34e105e04f864dfa3cb4cc626fa
---
 backends/arm/test/ops/test_bmm.py            | 2 +-
 backends/arm/test/ops/test_depthwise_conv.py | 2 +-
 backends/arm/test/ops/test_layer_norm.py     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py
index 2cf90b2119..523a90cdc8 100644
--- a/backends/arm/test/ops/test_bmm.py
+++ b/backends/arm/test/ops/test_bmm.py
@@ -161,7 +161,7 @@ def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
         )

     @parameterized.expand(BMM.test_parameters[1:])
-    @unittest.expectedFailure
+    @common.expectedFailureOnFVP
     def test_bmm_u85_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor):
         test_data = (operand1, operand2)
         self._test_bmm_ethosu_BI_pipeline(
diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py
index 628b25c259..d753245f43 100644
--- a/backends/arm/test/ops/test_depthwise_conv.py
+++ b/backends/arm/test/ops/test_depthwise_conv.py
@@ -260,7 +260,7 @@ def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module):
         )  # Works

     @parameterized.expand(testsuite_conv2d, skip_on_empty=True)
-    @common.expectedFailureOnFVP
+    @unittest.expectedFailure
     def test_dw_conv2d_u55_BI(
         self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False
     ):
diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py
index 7375a25383..e84dd4ee58 100644
--- a/backends/arm/test/ops/test_layer_norm.py
+++ b/backends/arm/test/ops/test_layer_norm.py
@@ -158,7 +158,7 @@ def test_layer_norm_tosa_BI(
     # Numerical issues on FVP likely due to mul op, MLETORCH-521
     # Skip tests that require transposes.
     @parameterized.expand(test_data_suite[:-2])
-    @common.expectedFailureOnFVP
+    @unittest.expectedFailure
     def test_layer_norm_u55_BI(
         self,
         test_name: str,

From 2f61fbb5ebae4db9b8f9c85d3588abc3392c4d77 Mon Sep 17 00:00:00 2001
From: Zingo Andersen
Date: Tue, 15 Oct 2024 15:39:22 +0200
Subject: [PATCH 20/27] Arm backend: Updated toolchain to arm-gnu-toolchain-13.3.rel1

Updated the toolchain for no other reason than to pick up general improvements.
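
A quick way to confirm the new toolchain is picked up after re-running
examples/arm/setup.sh (hypothetical session; the exact build string depends
on the host platform and release packaging):

    $ arm-none-eabi-gcc --version
    arm-none-eabi-gcc (Arm GNU Toolchain 13.3.Rel1 ...) 13.3.1 ...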
Signed-off-by: Zingo Andersen Change-Id: If65f3986a0011e99f9b0c57bdb072dce6edb97ef --- examples/arm/setup.sh | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 84f2371466..6f619ef058 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -55,9 +55,9 @@ if [[ "${ARCH}" == "x86_64" ]]; then corstone320_md5_checksum="3deb3c68f9b2d145833f15374203514d" # toochain - toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi.tar.xz" - toolchain_dir="arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi" - toolchain_md5_checksum="00ebb1b70b1f88906c61206457eacb61" + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/13.3.rel1/binrel/arm-gnu-toolchain-13.3.rel1-x86_64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-13.3.rel1-x86_64-arm-none-eabi" + toolchain_md5_checksum="0601a9588bc5b9c99ad2b56133b7f118" elif [[ "${ARCH}" == "aarch64" ]] || [[ "${ARCH}" == "arm64" ]]; then # FVPs corstone300_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64_armv8l.tgz?rev=9cc6e9a32bb947ca9b21fa162144cb01&hash=7657A4CF27D42E892E3F08D452AAB073" @@ -70,13 +70,13 @@ elif [[ "${ARCH}" == "aarch64" ]] || [[ "${ARCH}" == "arm64" ]]; then # toochain if [[ "${OS}" == "Darwin" ]]; then - toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-darwin-arm64-arm-none-eabi.tar.xz" - toolchain_dir="arm-gnu-toolchain-12.3.rel1-darwin-arm64-arm-none-eabi" - toolchain_md5_checksum="53d034e9423e7f470acc5ed2a066758e" + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/13.3.rel1/binrel/arm-gnu-toolchain-13.3.rel1-darwin-arm64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-13.3.rel1-darwin-arm64-arm-none-eabi" + toolchain_md5_checksum="f1c18320bb3121fa89dca11399273f4e" elif [[ "${OS}" == "Linux" ]]; then - toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi.tar.xz" - toolchain_dir="arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi" - toolchain_md5_checksum="02c9b0d3bb1110575877d8eee1f223f2" + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/13.3.rel1/binrel/arm-gnu-toolchain-13.3.rel1-aarch64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-13.3.rel1-aarch64-arm-none-eabi" + toolchain_md5_checksum="303102d97b877ebbeb36b3158994b218" fi else echo "[main] Error: only x86-64 & aarch64/arm64 architecture is supported for now!"; exit 1; @@ -178,15 +178,15 @@ function setup_fvp() { function setup_toolchain() { # Download and install the arm-none-eabi toolchain cd "${root_dir}" - if [[ ! -e gcc.tar.xz ]]; then + if [[ ! -e "${toolchain_dir}.tar.xz" ]]; then echo "[${FUNCNAME[0]}] Downloading toolchain ..." - curl --output gcc.tar.xz "${toolchain_url}" - verify_md5 ${toolchain_md5_checksum} gcc.tar.xz + curl --output "${toolchain_dir}.tar.xz" "${toolchain_url}" + verify_md5 ${toolchain_md5_checksum} "${toolchain_dir}.tar.xz" fi echo "[${FUNCNAME[0]}] Installing toolchain ..." 
rm -rf "${toolchain_dir}" - tar xf gcc.tar.xz + tar xf "${toolchain_dir}.tar.xz" toolchain_bin_path="$(cd ${toolchain_dir}/bin && pwd)" export PATH=${PATH}:${toolchain_bin_path} hash arm-none-eabi-gcc From fc50da13d4ec2051a8db12cf9190669e9cd8cbee Mon Sep 17 00:00:00 2001 From: Eashan Garg Date: Wed, 20 Nov 2024 22:00:29 -0800 Subject: [PATCH 21/27] Buckify arm/test files Summary: Buckify non-test arm files, to allow ArmTester to be used internally Differential Revision: D66283212 --- backends/arm/TARGETS | 11 +++++++++++ backends/arm/test/TARGETS | 23 +++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 backends/arm/test/TARGETS diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS index bd42710d7b..05f6095c37 100644 --- a/backends/arm/TARGETS +++ b/backends/arm/TARGETS @@ -110,3 +110,14 @@ python_library( "//executorch/backends/arm/operators:node_visitor", ], ) + +python_library( + name = "arm_model_evaluator", + src = [ + "util/arm_model_evaluator.py", + ], + typing = True, + deps = [ + "//caffe2:torch", + ] +) diff --git a/backends/arm/test/TARGETS b/backends/arm/test/TARGETS new file mode 100644 index 0000000000..ef092c5503 --- /dev/null +++ b/backends/arm/test/TARGETS @@ -0,0 +1,23 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "common", + srcs = ["common.py"], + deps = [ + "//executorch/backends/xnnpack/test/tester:tester", + "//executorch/backends/arm:arm_backend", + "//executorch/exir:lib", + "//executorch/exir/backend:compile_spec_schema", + ] +) + +python_library( + name = "runner_utils", + srcs = ["runner_utils.py"], + deps = [ + "//executorch/backends/xnnpack/test/tester:tester", + "//executorch/backends/arm:arm_backend", + "//executorch/exir:lib", + "//executorch/exir/backend:compile_spec_schema", + ] +) From 12281264631969c1936cafebeba6e6b403e4cb72 Mon Sep 17 00:00:00 2001 From: Benjamin Klimczak Date: Mon, 11 Nov 2024 15:37:23 +0000 Subject: [PATCH 22/27] Add support for torch.ops.aten._to_copy.default Lower torch.ops.aten._to_copy.default to TOSA CAST op. This resolves issues around arithmetic operators when using int scalars in unquantized networks (see new test cases in test_scalars.py). Note: Parameter 'memory_format' is not supported. Change-Id: I7a921ca510c5b46f15b5399218f9230ba0f93d88 --- backends/arm/operator_support/__init__.py | 1 + .../arm/operator_support/to_copy_support.py | 120 ++++++++++++++++++ backends/arm/operators/__init__.py | 1 + backends/arm/operators/op_to_copy.py | 43 +++++++ backends/arm/test/ops/test_scalars.py | 16 ++- backends/arm/test/ops/test_to_copy.py | 70 ++++++++++ 6 files changed, 249 insertions(+), 2 deletions(-) create mode 100644 backends/arm/operator_support/to_copy_support.py create mode 100644 backends/arm/operators/op_to_copy.py create mode 100644 backends/arm/test/ops/test_to_copy.py diff --git a/backends/arm/operator_support/__init__.py b/backends/arm/operator_support/__init__.py index c133ce8003..297047963c 100644 --- a/backends/arm/operator_support/__init__.py +++ b/backends/arm/operator_support/__init__.py @@ -8,6 +8,7 @@ from . 
import ( # noqa mean_dim_support, right_shift_support, + to_copy_support, tosa_supported_operators, var_correction_support, ) diff --git a/backends/arm/operator_support/to_copy_support.py b/backends/arm/operator_support/to_copy_support.py new file mode 100644 index 0000000000..9bba274804 --- /dev/null +++ b/backends/arm/operator_support/to_copy_support.py @@ -0,0 +1,120 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +import logging + +import torch + +import torch.fx as fx + +from executorch.backends.arm.operator_support.tosa_supported_operators import ( + register_tosa_support_check, + SupportedTOSAOperatorCheck, +) +from executorch.backends.arm.tosa_specification import TosaSpecification +from executorch.exir.dialects._ops import ops as exir_ops + +logger = logging.getLogger(__name__) + + +@register_tosa_support_check +class ToCopySupported(SupportedTOSAOperatorCheck): + targets = [exir_ops.edge.aten._to_copy.default] + + tosa_specs = [ + TosaSpecification.create_from_string("TOSA-0.80.0+BI"), + TosaSpecification.create_from_string("TOSA-0.80.0+MI"), + ] + + SupportedTypeDict = dict[torch.dtype, list[torch.dtype]] + + @staticmethod + def _merge_supported_types( + dtypes1: SupportedTypeDict, dtypes2: SupportedTypeDict + ) -> SupportedTypeDict: + merged_dtypes = dtypes1 + for k, v in dtypes2.items(): + merged_dtypes[k] = merged_dtypes.get(k, []) + v + return merged_dtypes + + SUPPORTED_INT_TYPES: SupportedTypeDict = { + torch.bool: [torch.int8, torch.int16, torch.int32], + torch.int8: [torch.bool, torch.int16, torch.int32], + torch.int16: [torch.bool, torch.int8, torch.int32], + torch.int32: [torch.bool, torch.int8, torch.int16], + } + SUPPORTED_FLOAT_TYPES: SupportedTypeDict = { + torch.int8: [torch.float16, torch.bfloat16, torch.float32], + torch.int16: [torch.float16, torch.bfloat16, torch.float32], + torch.int32: [torch.float16, torch.bfloat16, torch.float32], + torch.bfloat16: [torch.int8, torch.int16, torch.int32, torch.float32], + torch.float16: [torch.int8, torch.int16, torch.int32, torch.float32], + torch.float32: [ + torch.int8, + torch.int16, + torch.int32, + torch.bfloat16, + torch.float16, + ], + } + ALL_SUPPORTED_TYPES = _merge_supported_types( + SUPPORTED_INT_TYPES, SUPPORTED_FLOAT_TYPES + ) + POSSIBLE_TYPE_CONVERSIONS = {torch.int64: torch.int32} + + def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool: + assert node.target in self.targets + + if tosa_spec not in self.tosa_specs: + return False + + assert tosa_spec.support_integer() + supported_dtypes = ( + self.ALL_SUPPORTED_TYPES + if tosa_spec.support_float() + else self.SUPPORTED_INT_TYPES + ) + # Take into account possible type conversions + supported_dtypes.update( + (k, supported_dtypes[v]) + for k, v in self.POSSIBLE_TYPE_CONVERSIONS.items() + if v in supported_dtypes + ) + + # Check input type + assert len(node.all_input_nodes) == 1 + input_val = node.all_input_nodes[0].meta["val"] + assert isinstance(input_val, torch._subclasses.FakeTensor) + input_dtype = input_val.dtype + if input_dtype not in supported_dtypes: + logger.info( + f"Input dtype {input_val.dtype} is not supported in " + f"{node.target.name()}." 
+ ) + return False + + # Check output type + output_val = node.meta["val"] + assert isinstance(output_val, torch._subclasses.FakeTensor) + if output_val.dtype not in supported_dtypes[input_dtype]: + logger.info( + f"Output dtype {output_val.dtype} is not supported in " + f"{node.target.name()} for input dtype {input_dtype}. " + f"Supported output types: " + f"{''.join(str(t) for t in supported_dtypes[input_dtype])}" + ) + return False + + # Check memory format + if "memory_format" in node.kwargs: + if node.kwargs["memory_format"] in (torch.preserve_format,): + logger.info( + f"Argument 'memory_format' is not supported for " + f"{node.target.name()} right now." + ) + return False + + return True diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index a5c2dd8dc5..8c4aa85e57 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -36,6 +36,7 @@ op_sub, op_sum, op_tanh, + op_to_copy, op_transpose, op_unsqueeze, op_upsample_nearest2d, diff --git a/backends/arm/operators/op_to_copy.py b/backends/arm/operators/op_to_copy.py new file mode 100644 index 0000000000..15077d6df7 --- /dev/null +++ b/backends/arm/operators/op_to_copy.py @@ -0,0 +1,43 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import serializer.tosa_serializer as ts +import torch +import tosa.Op as TosaOp + +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg + + +@register_node_visitor +class ToCopyVisitor(NodeVisitor): + """ + Implement the type cast functionality of _to_copy. + + Other features like setting of the memory_format or moving a tensor to a + different device are not supported. + + Also note that the node should not be quantized. + """ + + target = "aten._to_copy.default" + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + assert not is_quant_node, "Casting of quantized values is not supported." + assert inputs + tosa_graph.addOperator(TosaOp.Op().CAST, [inputs[0].name], [output.name]) diff --git a/backends/arm/test/ops/test_scalars.py b/backends/arm/test/ops/test_scalars.py index cd3dd72f60..455b484b94 100644 --- a/backends/arm/test/ops/test_scalars.py +++ b/backends/arm/test/ops/test_scalars.py @@ -153,9 +153,21 @@ def _test_add_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: tuple): .run_method_and_compare_outputs(inputs=test_data) ) - # Most MI tests fail, just show one working for now. - @parameterized.expand((tensor_scalar_tests[6],)) + @parameterized.expand(tensor_scalar_tests) def test_MI(self, test_name: str, op: torch.nn.Module, x, y): + expected_exception = None + if any(token in test_name for token in ("Sub_int", "Sub__int")): + expected_exception = RuntimeError + elif test_name.endswith("_st"): + expected_exception = AttributeError + + if expected_exception: + with self.assertRaises( + expected_exception, msg=f"Test {test_name} is expected to fail." + ): + self._test_add_tosa_MI_pipeline(op, (x, y)) + return + self._test_add_tosa_MI_pipeline(op, (x, y)) # op(Scalar float, tensor) works if the scalar is constant. 
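
Note on the test_scalars.py change above: the int-scalar issue mentioned in
the commit message boils down to graphs like the following minimal sketch
(illustrative only, not part of this diff), where export inserts an implicit
aten._to_copy to reconcile dtypes before the arithmetic op, and this patch
lets that cast be lowered to TOSA CAST so the graph can be delegated in the
unquantized (MI) flow:

    class AddScalar(torch.nn.Module):
        def forward(self, x: torch.Tensor):
            # x is float; the int scalar is lifted to an int tensor and then
            # cast via _to_copy before the add (sketch of the mechanism).
            return x + 3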
diff --git a/backends/arm/test/ops/test_to_copy.py b/backends/arm/test/ops/test_to_copy.py
new file mode 100644
index 0000000000..8499512e10
--- /dev/null
+++ b/backends/arm/test/ops/test_to_copy.py
@@ -0,0 +1,70 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#
+# Tests the _to_copy op which is interpreted as a cast for our purposes.
+#
+
+import unittest
+
+import torch
+
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+
+from parameterized import parameterized
+
+
+class Cast(torch.nn.Module):
+    def __init__(self, target_dtype):
+        super().__init__()
+        self.target_dtype = target_dtype
+
+    def forward(self, x: torch.Tensor):
+        return x.to(dtype=self.target_dtype)
+
+
+class TestToCopy(unittest.TestCase):
+    """
+    Tests the _to_copy operation.
+
+    Only tests unquantized graphs, as explicit dtype casting interferes with
+    quantization.
+
+    Note: This is also covered by test_scalars.py.
+    """
+
+    _TO_COPY_TEST_DATA = (
+        (torch.rand((1, 2, 3, 4), dtype=torch.float16), torch.float32),
+        (torch.rand((1, 2, 3, 4), dtype=torch.float32), torch.float16),
+        (torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int8), torch.float32),
+        (torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int8), torch.int32),
+        (torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int32), torch.int8),
+    )
+
+    def _test_to_copy_tosa_MI_pipeline(
+        self, module: torch.nn.Module, test_data: torch.Tensor
+    ):
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=common.get_tosa_compile_spec("TOSA-0.80.0+MI"),
+            )
+            .export()
+            .dump_artifact()
+            .check_count({"torch.ops.aten._to_copy.default": 1})
+            .to_edge()
+            .dump_artifact()
+            .partition()
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+            .run_method_and_compare_outputs(inputs=test_data)
+        )
+
+    @parameterized.expand(_TO_COPY_TEST_DATA)
+    def test_to_copy_tosa_MI(self, test_tensor: torch.Tensor, new_dtype):
+        self._test_to_copy_tosa_MI_pipeline(Cast(new_dtype), (test_tensor,))

From 3475707dcdba611818db6bebafec4cc5691b3499 Mon Sep 17 00:00:00 2001
From: AIWintermuteAI <32562299+AIWintermuteAI@users.noreply.github.com>
Date: Thu, 28 Nov 2024 17:24:29 +0100
Subject: [PATCH 23/27] Update run.sh to use arm-none-eabi-size

---
 examples/arm/run.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index 0e5fa9db34..cbc96c4b11 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -213,9 +213,9 @@ function build_executorch_runner() {
    cmake --build ${executor_runner_path}/cmake-out --parallel -- arm_executor_runner
    echo "[${FUNCNAME[0]}] Generated baremetal elf file:"
    find ${executor_runner_path}/cmake-out -name "arm_executor_runner"
-    echo "executable_text: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $1}') bytes"
-    echo "executable_data: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $2}') bytes"
-    echo "executable_bss: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $3}') bytes"
+    echo "executable_text: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $1}') bytes"
+    echo "executable_data: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $2}') bytes"
+    echo "executable_bss: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $3}') bytes"
 }
 
 # Execute the executor_runner on FVP Simulator

From 8af65d35c341a8ae74f250d7f9885f0ad9f3b33a Mon Sep 17 00:00:00 2001
From: Saoirse Stewart
Date: Thu, 28 Nov 2024 15:14:00 +0000
Subject: [PATCH 24/27] Update the ArmBackend to check the total number of
 elements in the output tensors

* Add a multiple-output sample model to aot_arm_compiler
---
 backends/arm/runtime/ArmBackendEthosU.cpp | 72 +++++++++++++++--------
 examples/arm/aot_arm_compiler.py          | 10 ++++
 2 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
index 99ce0a9df2..a14c42140e 100644
--- a/backends/arm/runtime/ArmBackendEthosU.cpp
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -138,6 +138,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
     // TODO(MLETORCH-123): Optimise into direct write from Vela into the SRAM
     // or DRAM output for compatible data layouts.
     for (int i = 0; i < handles.inputs->count; i++) {
+      auto tensor_count = 1, io_count = 1;
       auto tensor_in = args[i]->toTensor();
       char* scratch_addr = handles.scratch_data + handles.inputs->io[i].offset;
 
@@ -202,6 +203,19 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
         ET_LOG(Error, "No matching input copy routine");
         return Error::InvalidProgram;
       }
+      if (!permuted_input_shape) {
+        calculate_dimensions(
+            tensor_in, &handles.inputs->io[i], &tensor_count, &io_count);
+        if (tensor_count != io_count) {
+          ET_LOG(Error, "Input tensor sizes do not match");
+          ET_LOG(
+              Error,
+              "Program expects %d elements but got %d",
+              io_count,
+              tensor_count);
+          return Error::InvalidProgram;
+        }
+      }
     }
 
     // Allocate driver handle and synchronously invoke driver
@@ -236,14 +250,24 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
           result);
       return Error::InvalidProgram;
     }
-
+    int tensor_dim = 0, io_dim = 0;
     // Write outputs from scratch into EValue pointers
    for (int i = 0; i < handles.outputs->count; i++) {
+      int tensor_count = 1, io_count = 1;
      const char* output_addr = handles.scratch_data + handles.outputs->io[i].offset;
       // Process input EValue into scratch
       // Outputs are in the index immediately after inputs
       auto tensor_out = args[handles.inputs->count + i]->toTensor();
+
+      calculate_dimensions(
+          tensor_out, &handles.outputs->io[i], &tensor_count, &io_count);
+
+      // At times the topological order of the outputs may change.
+      // Instead of matching outputs one-to-one, ensure that the summed
+      // element counts across all outputs match.
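+      // Example (illustrative): outputs of shape {1, 10, 4} and {1, 10, 1}
+      // contribute 40 + 10 = 50 elements to tensor_dim, which must equal the
+      // summed element counts of the corresponding VelaIO shapes in io_dim.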
+      tensor_dim = tensor_dim + tensor_count;
+      io_dim = io_dim + io_count;
+
       bool permuted_output_shape;
       ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute(
           i,
@@ -272,6 +296,12 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
         }
       }
     }
+    if (tensor_dim != io_dim) {
+      ET_LOG(Error, "Total output tensor sizes do not match");
+      ET_LOG(
+          Error, "Program expects size of %d but got %d", io_dim, tensor_dim);
+      return Error::InvalidProgram;
+    }
     return Error::Ok;
   }
 
@@ -280,6 +310,21 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
   }
 
  private:
+  void calculate_dimensions(
+      const executorch::aten::Tensor tensor,
+      VelaIO* io,
+      int* tensor_count,
+      int* io_count) const {
+    for (int i = 0; i < tensor.dim(); i++) {
+      *tensor_count = *tensor_count * tensor.size(i);
+    }
+
+    // The VelaIO type has a shape of fixed size 4
+    for (int i = 0; i < 4; i++) {
+      *io_count = *io_count * io->shape[i];
+    }
+  }
+
   Error check_requires_permute(
       int index,
       const executorch::aten::Tensor tensor,
@@ -287,6 +332,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
       bool permuted_io_flag,
       bool* is_permuted) const {
     bool permuted_shape = false;
+
     if (tensor.dim() == 4) {
       // special case for NHWC workaround in AOT; as the compilation has
       // permuted to channel last in an undetectable way, we assume here
@@ -304,30 +350,6 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
         return Error::InvalidProgram;
       }
     }
-    if (!permuted_shape) {
-      // Check the number of elements in each tensor match
-      int tensor_count = 1;
-      int io_count = 1;
-
-      for (int i = 0; i < tensor.dim(); i++) {
-        tensor_count = tensor_count * tensor.size(i);
-      }
-
-      // The VelaIO type has a shape of fixed size 4
-      for (int i = 0; i < 4; i++) {
-        io_count = io_count * io->shape[i];
-      }
-
-      if (tensor_count != io_count) {
-        ET_LOG(Error, "Input tensor sizes do not match");
-        ET_LOG(
-            Error,
-            "Program expects %d elements but got %d",
-            io_count,
-            tensor_count);
-        return Error::InvalidProgram;
-      }
-    }
     *is_permuted = permuted_shape;
     return Error::Ok;
   }
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index a16d947dd6..6d899c2146 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -172,11 +172,21 @@ def forward(self, x):
     can_delegate = False
 
 
+class MultipleOutputsModule(torch.nn.Module):
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor):
+        return (x * y, x.sum(dim=-1, keepdim=True))
+
+    example_input = (torch.randn(10, 4, 5), torch.randn(10, 4, 5))
+    can_delegate = True
+
+
 models = {
     "add": AddModule,
     "add2": AddModule2,
     "add3": AddModule3,
     "softmax": SoftmaxModule,
+    "MultipleOutputsModule": MultipleOutputsModule,
 }
 
 calibration_data = {

From 1c9abfa6fa33fd365d51919669d159c4babf8057 Mon Sep 17 00:00:00 2001
From: Adrian Lundell
Date: Wed, 13 Nov 2024 11:08:59 +0100
Subject: [PATCH 25/27] [Arm backend] Support keep_dims == True for meandim
 and var ops

- Adds keepdim support in the decompose_var/decompose_meandim passes
- Renames insert_squeeze_after_sum to a more general name for future ops
- Adds the get_node_arg/set_node_arg helper functions
- Updates TOSASupportedOperators

Change-Id: Ifda19d1c3ed67d03d0c896bf4f74253d875354cc
---
 backends/arm/_passes/arm_pass_manager.py      |  6 +-
 backends/arm/_passes/arm_pass_utils.py        | 58 +++++++++++++++++++
 .../arm/_passes/decompose_meandim_pass.py     | 13 +++--
 backends/arm/_passes/decompose_var_pass.py    | 27 +++++----
 ....py => keep_dims_false_to_squeeze_pass.py} | 42 +++++++++++---
 backends/arm/operator_support/__init__.py     |  8 +--
 .../arm/operator_support/mean_dim_support.py  | 33 -----------
 .../tosa_supported_operators.py               |  3 +
 .../var_correction_support.py                 | 33 -----------
 backends/arm/test/ops/test_mean_dim.py        |  4 +-
 backends/arm/test/ops/test_var.py             |  8 +--
 .../passes/test_meandim_to_averagepool2d.py   |  8 ++-
 12 files changed, 135 insertions(+), 108 deletions(-)
 rename backends/arm/_passes/{insert_squeeze_after_sum_pass.py => keep_dims_false_to_squeeze_pass.py} (58%)
 delete mode 100644 backends/arm/operator_support/mean_dim_support.py
 delete mode 100644 backends/arm/operator_support/var_correction_support.py

diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
index a72cdfd1a0..1e2b26ef64 100644
--- a/backends/arm/_passes/arm_pass_manager.py
+++ b/backends/arm/_passes/arm_pass_manager.py
@@ -29,8 +29,8 @@
     DecomposeSoftmaxesPass,
 )
 from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass
-from executorch.backends.arm._passes.insert_squeeze_after_sum_pass import (
-    InsertSqueezeAfterSumPass,
+from executorch.backends.arm._passes.keep_dims_false_to_squeeze_pass import (
+    KeepDimsFalseToSqueezePass,
 )
 from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass
 from executorch.backends.arm._passes.meandim_to_averagepool_pass import (
@@ -71,7 +71,7 @@ def transform_to_backend_pipeline(
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(MatchArgRanksPass(exported_program))
         self.add_pass(DecomposeDivPass())
-        self.add_pass(InsertSqueezeAfterSumPass())
+        self.add_pass(KeepDimsFalseToSqueezePass())
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(Conv1dUnsqueezePass(exported_program))
         self.add_pass(DecomposeSoftmaxesPass())
diff --git a/backends/arm/_passes/arm_pass_utils.py b/backends/arm/_passes/arm_pass_utils.py
index 3fcf724e5b..78ee6e265c 100644
--- a/backends/arm/_passes/arm_pass_utils.py
+++ b/backends/arm/_passes/arm_pass_utils.py
@@ -7,6 +7,7 @@
 
 # pyre-unsafe
 
+from inspect import isclass
 from typing import Optional
 
 import torch
@@ -133,3 +134,60 @@ def get_first_fake_tensor(node: torch.fx.Node) -> FakeTensor:
         fake_tensor, FakeTensor
     ), f'Found {fake_tensor} in meta["val"] of {node}, expected to find FakeTensor.'
     return fake_tensor
+
+
+def get_node_arg(args: list | dict, key: int | str | type, default_value=None):
+    """
+    Helper function for getting a value from node.args/kwargs, three cases:
+    1. By position in node.args - Returns the arg at the given position, or default_value if the index is one past the end
+    2. By key in node.kwargs - Returns the kwarg with the given key, or default_value if it does not exist
+    3. By type in node.args - Returns the first arg of the given type. Useful for cases where arg positions may differ but types are unique.
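+
+    Example (illustrative), for a node with args = (x, [1], True):
+        get_node_arg(node.args, 2, False)           -> True  (by position)
+        get_node_arg(node.kwargs, "keepdim", False) -> False if the kwarg is absent (by key)
+        get_node_arg(node.args, bool, False)        -> True  (first bool-typed arg)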
+    """
+    if isinstance(key, int):
+        if 0 <= key < len(args):
+            return args[key]
+        elif key == len(args):
+            if default_value is not None:
+                return default_value
+            else:
+                raise RuntimeError(f"No default value given for index {key}")
+        else:
+            raise RuntimeError(
+                f"Out of bounds index {key} for getting value in args (of size {len(args)})"
+            )
+    elif isinstance(key, str):
+        return args.get(key, default_value)
+    elif isclass(key):
+        for arg in args:
+            if isinstance(arg, key):
+                return arg
+        if default_value is not None:
+            return default_value
+        else:
+            raise RuntimeError(f"No arg of type {key}")
+    else:
+        raise RuntimeError("Invalid type")
+
+
+def set_node_arg(node: torch.fx.Node, i: int | str, value):
+    """
+    Helper function for setting a value in node.args/kwargs. If the index equals the list length, the value is instead appended to the list.
+    """
+    if isinstance(i, int):
+        if 0 <= i < len(node.args):
+            args = list(node.args)
+            args[i] = value
+            node.args = tuple(args)
+            return
+        elif i == len(node.args):
+            node.args = node.args + (value,)
+        else:
+            raise RuntimeError(
+                f"Out of bounds index {i} for setting value in {node} args (of size {len(node.args)})"
+            )
+    elif isinstance(i, str):
+        kwargs = dict(node.kwargs)
+        kwargs[i] = value
+        node.kwargs = kwargs
+    else:
+        raise RuntimeError("Invalid type")
diff --git a/backends/arm/_passes/decompose_meandim_pass.py b/backends/arm/_passes/decompose_meandim_pass.py
index d927fd613c..abf5c8f363 100644
--- a/backends/arm/_passes/decompose_meandim_pass.py
+++ b/backends/arm/_passes/decompose_meandim_pass.py
@@ -7,6 +7,7 @@
 # pyre-unsafe
 
 import torch
+from executorch.backends.arm._passes.arm_pass_utils import get_node_arg
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
 
@@ -42,16 +43,16 @@ def call_operator(self, op, args, kwargs, meta):
         if op not in (exir_ops.edge.aten.mean.dim, torch.ops.aten.mean.dim):
             return super().call_operator(op, args, kwargs, meta)
 
-        x = args[0]
-        dim = args[1]
-        keepdim = args[2] if len(args) > 2 else False
-        if not keepdim:
-            return super().call_operator(op, args, kwargs, meta)
-
-        # if keepdim == True and dim == [-1, -2], mean.dim can be
+        x = get_node_arg(args, 0)
+        dim = get_node_arg(args, 1)
+        keepdim = get_node_arg(args, 2, False)
+
+        # if dim == [-1, -2], mean.dim can be
         # decomposed to avg_pool2d. This is handled by ConvertMeanDimToAveragePool.
         if dim == [-1, -2]:
             # Simply return the mean.dim operator for future decomposition.
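+            # Illustrative: x.mean(dim=[-1, -2], keepdim=True) on an NCHW
+            # tensor is a global average pool over the H and W axes.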
return super().call_operator(op, args, kwargs, meta) + shape = meta["val"].size() dtype = meta["val"].dtype input_shape = x.data.size() diff --git a/backends/arm/_passes/decompose_var_pass.py b/backends/arm/_passes/decompose_var_pass.py index cc8f0eb6da..283760e423 100644 --- a/backends/arm/_passes/decompose_var_pass.py +++ b/backends/arm/_passes/decompose_var_pass.py @@ -8,6 +8,7 @@ import torch +from executorch.backends.arm._passes.arm_pass_utils import get_node_arg from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -53,26 +54,30 @@ def call_operator(self, op, args, kwargs, meta): torch.ops.aten.var.dim, ): return super().call_operator(op, args, kwargs, meta) - shape = meta["val"].size() + + x = args[0] + input_shape = x.data.size() + shape = list(meta["val"].size()) + if shape == []: + shape = [1 for _ in input_shape] + dtype = meta["val"].dtype - dim = args[1] if len(args) > 1 else list(range(len(shape))) + # Get dim from args based on argument type + dim = get_node_arg(args, key=list, default_value=list(range(len(shape)))) + if op == torch.ops.aten.var.dim: - correction = args[-2] - keepdim = args[-1] + keepdim = get_node_arg(args, bool, False) + correction = get_node_arg(args, int, 1) else: - correction = kwargs["correction"] - keepdim = kwargs.get("keepdim", False) - if not keepdim: - return super().call_operator(op, args, kwargs, meta) + correction = get_node_arg(kwargs, "correction", 1) + keepdim = get_node_arg(kwargs, "keepdim", False) - x = args[0] - input_shape = x.data.size() N = 1 for d in dim: N *= input_shape[d] mean_op, diff_op, mul_op, sum_op, full_op = get_var_decomposition(op) - mean = super().call_operator(mean_op, (x, dim, keepdim), {}, meta) + mean = super().call_operator(mean_op, (x, dim, True), {}, meta) diff = super().call_operator(diff_op, (x, mean), {}, meta) squared_diff = super().call_operator(mul_op, (diff, diff), {}, meta) sum = super().call_operator(sum_op, (squared_diff, dim, keepdim), {}, meta) diff --git a/backends/arm/_passes/insert_squeeze_after_sum_pass.py b/backends/arm/_passes/keep_dims_false_to_squeeze_pass.py similarity index 58% rename from backends/arm/_passes/insert_squeeze_after_sum_pass.py rename to backends/arm/_passes/keep_dims_false_to_squeeze_pass.py index e088c2e35a..736c627d91 100644 --- a/backends/arm/_passes/insert_squeeze_after_sum_pass.py +++ b/backends/arm/_passes/keep_dims_false_to_squeeze_pass.py @@ -10,14 +10,18 @@ import torch import torch.fx -from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.backends.arm._passes.arm_pass_utils import ( + create_node, + get_node_arg, + set_node_arg, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -class InsertSqueezeAfterSumPass(ExportPass): +class KeepDimsFalseToSqueezePass(ExportPass): """ - In Pytorch, the default behaviour of Tensor.sum is to squeeze + In Pytorch, the default behaviour of for example Tensor.sum is to squeeze the dimension that is summed (keep_dim = False). However, in TOSA, REDUCE_SUM always preserves the rank of the input (keep_dim = True). 
@@ -31,21 +35,44 @@ class InsertSqueezeAfterSumPass(ExportPass): squeeze(dim = dims) """ + # CURRENTLY NOT HANDLED OPS + # exir_ops.edge.aten.amax, + # exir_ops.edge.aten.amin, + # exir_ops.edge.aten.any.dim, + # exir_ops.edge.aten.any.dims, + # exir_ops.edge.aten.argmax, + # exir_ops.edge.aten.argmin, + # exir_ops.edge.aten.max.dim, + # exir_ops.edge.aten.min.dim, + # exir_ops.edge.aten.prod.dim_int, + + # HANDLED OPS + # exir_ops.edge.aten.sum.dim_IntList + # exir_ops.edge.aten.var.correction (decomposed in decompose_var_pass) + # exir_ops.edge.aten.var.dim (decomposed in decompose_var_pass) + # exir_ops.edge.aten.mean.dim (decomposed in decompose_meandim_pass) + def call(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: + keep_dim_index = None + if node.op != "call_function": continue - if node.target != exir_ops.edge.aten.sum.dim_IntList: + if node.target == exir_ops.edge.aten.sum.dim_IntList: + keep_dim_index = 2 + else: continue + sum_node = cast(torch.fx.Node, node) - keep_dim = cast(bool, sum_node.args[2] if len(sum_node.args) > 2 else False) + keep_dim = get_node_arg(sum_node.args, keep_dim_index, False) + if keep_dim: continue - dim_list = cast(list[int], sum_node.args[1]) + dim_list = get_node_arg(sum_node.args, 1, [0]) # Add keep_dim = True arg to sum node. - sum_node.args = sum_node.args[0:2] + (True,) + set_node_arg(sum_node, 2, True) with graph_module.graph.inserting_after(sum_node): squeeze_node = create_node( @@ -53,6 +80,7 @@ def call(self, graph_module: torch.fx.GraphModule): ) sum_node.replace_all_uses_with(squeeze_node) squeeze_node.args = (sum_node, dim_list) + graph_module.graph.eliminate_dead_code() graph_module.recompile() graph_module = super().call(graph_module).graph_module diff --git a/backends/arm/operator_support/__init__.py b/backends/arm/operator_support/__init__.py index 297047963c..08f58b1e43 100644 --- a/backends/arm/operator_support/__init__.py +++ b/backends/arm/operator_support/__init__.py @@ -5,10 +5,4 @@ # pyre-unsafe -from . import ( # noqa - mean_dim_support, - right_shift_support, - to_copy_support, - tosa_supported_operators, - var_correction_support, -) +from . import right_shift_support, to_copy_support, tosa_supported_operators # noqa diff --git a/backends/arm/operator_support/mean_dim_support.py b/backends/arm/operator_support/mean_dim_support.py deleted file mode 100644 index 67a7c20406..0000000000 --- a/backends/arm/operator_support/mean_dim_support.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2024 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-unsafe - -from typing import cast - -import torch.fx as fx - -from executorch.backends.arm.operator_support.tosa_supported_operators import ( - register_tosa_support_check, - SupportedTOSAOperatorCheck, -) -from executorch.backends.arm.tosa_specification import TosaSpecification -from executorch.exir.dialects._ops import ops as exir_ops - - -@register_tosa_support_check -class MeanDimSupported(SupportedTOSAOperatorCheck): - targets = [exir_ops.edge.aten.mean.dim] - - tosa_specs = [ - TosaSpecification.create_from_string("TOSA-0.80.0+BI"), - TosaSpecification.create_from_string("TOSA-0.80.0+MI"), - ] - - def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool: - assert node.target in self.targets - - keep_dim = node.args[2] if len(node.args) > 2 else False - return cast(bool, keep_dim) diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py index 3563ee9c51..7072ba6a82 100644 --- a/backends/arm/operator_support/tosa_supported_operators.py +++ b/backends/arm/operator_support/tosa_supported_operators.py @@ -92,6 +92,7 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool: exir_ops.edge.aten.avg_pool2d.default, exir_ops.edge.aten.max_pool2d_with_indices.default, exir_ops.edge.aten.sigmoid.default, + exir_ops.edge.aten.mean.dim, exir_ops.edge.aten.mm.default, exir_ops.edge.aten.repeat.default, exir_ops.edge.aten.reciprocal.default, @@ -105,6 +106,8 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool: exir_ops.edge.aten.sum.dim_IntList, exir_ops.edge.aten.tanh.default, exir_ops.edge.aten.upsample_nearest2d.vec, + exir_ops.edge.aten.var.correction, + exir_ops.edge.aten.var.dim, exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.clone.default, exir_ops.edge.aten.unsqueeze_copy.default, diff --git a/backends/arm/operator_support/var_correction_support.py b/backends/arm/operator_support/var_correction_support.py deleted file mode 100644 index 4aa2ae5e97..0000000000 --- a/backends/arm/operator_support/var_correction_support.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2024 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-unsafe - -from typing import cast - -import torch.fx as fx - -from executorch.backends.arm.operator_support.tosa_supported_operators import ( - register_tosa_support_check, - SupportedTOSAOperatorCheck, -) -from executorch.backends.arm.tosa_specification import TosaSpecification -from executorch.exir.dialects._ops import ops as exir_ops - - -@register_tosa_support_check -class VarCorrectionSupported(SupportedTOSAOperatorCheck): - targets = [exir_ops.edge.aten.var.correction] - - tosa_specs = [ - TosaSpecification.create_from_string("TOSA-0.80.0+BI"), - TosaSpecification.create_from_string("TOSA-0.80.0+MI"), - ] - - def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool: - assert node.target in self.targets - - keep_dim = node.kwargs.get("keepdim", False) - return cast(bool, keep_dim) diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index 3cb8c5f815..e725eb1ef4 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -51,7 +51,7 @@ class MeanDim(torch.nn.Module): test_data_suite = [ # (test_name, test_data) ("zeros", torch.zeros(1, 1280, 7, 7), -1, True), - ("ones", torch.ones(1, 1280, 7, 7), (-1, 2), True), + ("ones", torch.ones(1, 1280, 7, 7), (-1, 2), False), ( "rand", torch.rand(1, 1280, 7, 7), @@ -62,7 +62,7 @@ class MeanDim(torch.nn.Module): "randn", torch.randn(1, 1280, 7, 7), (-1, -2, -3), - True, + False, ), ] diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py index 06671848cc..727cd05393 100644 --- a/backends/arm/test/ops/test_var.py +++ b/backends/arm/test/ops/test_var.py @@ -29,9 +29,9 @@ class TestVar(unittest.TestCase): class Var(torch.nn.Module): test_parameters = [ (torch.randn(1, 50, 10, 20), True, 0), - (torch.rand(1, 50, 10), True, 0), + (torch.rand(1, 50, 10), False, 0), (torch.randn(1, 30, 15, 20), True, 1), - (torch.rand(1, 50, 10, 20), True, 0.5), + (torch.rand(1, 50, 10, 20), False, 0.5), ] def forward( @@ -45,9 +45,9 @@ def forward( class VarDim(torch.nn.Module): test_parameters = [ (torch.randn(1, 50, 10, 20), 1, True, False), - (torch.rand(1, 50, 10), -2, True, False), + (torch.rand(1, 50, 10), -2, False, False), (torch.randn(1, 30, 15, 20), -3, True, True), - (torch.rand(1, 50, 10, 20), -1, True, True), + (torch.rand(1, 50, 10, 20), -1, False, True), ] test_parameters_u55 = [ diff --git a/backends/arm/test/passes/test_meandim_to_averagepool2d.py b/backends/arm/test/passes/test_meandim_to_averagepool2d.py index 615187fb65..978a4c6fe5 100644 --- a/backends/arm/test/passes/test_meandim_to_averagepool2d.py +++ b/backends/arm/test/passes/test_meandim_to_averagepool2d.py @@ -68,8 +68,12 @@ def test_tosa_BI_meandim_no_modification(self): .quantize() .export() .to_edge() - .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check(["aten_sum_dim_int_list"]) + .check(["aten_full_default"]) + .check(["aten_mul_tensor"]) .run_passes(test_pass_stage) - .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check(["aten_sum_dim_int_list"]) + .check(["aten_full_default"]) + .check(["aten_mul_tensor"]) .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) ) From a3476654d72e445680d0daea417673c4b0ddfc0d Mon Sep 17 00:00:00 2001 From: Fredrik Knutsson Date: Tue, 26 Nov 2024 09:41:53 +0100 Subject: [PATCH 26/27] Improvements to Arm backend pytest setup * Add pytest fixture to ensure randomness on tests * Move pytest setup to conftest.py to separate pytest commonalities from general ones * 
Minor error catching improvements * Removed pytest dependency in ArmTester Change-Id: I9132681d705c1501391f3d4603f5d6f0786db873 --- backends/arm/test/common.py | 180 ++-------------- backends/arm/test/conftest.py | 196 ++++++++++++++++++ backends/arm/test/misc/test_debug_feats.py | 8 +- .../arm/test/models/test_mobilenet_v2_arm.py | 6 +- backends/arm/test/ops/test_add.py | 4 +- backends/arm/test/ops/test_avg_pool.py | 4 +- backends/arm/test/ops/test_bmm.py | 6 +- backends/arm/test/ops/test_cat.py | 9 +- backends/arm/test/ops/test_clone.py | 4 +- backends/arm/test/ops/test_conv1d.py | 5 +- backends/arm/test/ops/test_conv2d.py | 5 +- backends/arm/test/ops/test_conv_combos.py | 4 +- backends/arm/test/ops/test_depthwise_conv.py | 6 +- backends/arm/test/ops/test_div.py | 8 +- backends/arm/test/ops/test_exp.py | 4 +- backends/arm/test/ops/test_expand.py | 8 +- backends/arm/test/ops/test_full.py | 8 +- backends/arm/test/ops/test_hardtanh.py | 4 +- backends/arm/test/ops/test_layer_norm.py | 4 +- backends/arm/test/ops/test_linear.py | 4 +- backends/arm/test/ops/test_log.py | 4 +- backends/arm/test/ops/test_max_pool.py | 14 +- backends/arm/test/ops/test_mul.py | 4 +- backends/arm/test/ops/test_permute.py | 6 +- backends/arm/test/ops/test_reciprocal.py | 4 +- backends/arm/test/ops/test_sub.py | 5 +- backends/arm/test/runner_utils.py | 4 +- backends/arm/test/tester/arm_tester.py | 10 +- examples/arm/README.md | 2 +- 29 files changed, 283 insertions(+), 247 deletions(-) create mode 100644 backends/arm/test/conftest.py diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index 17353cab31..48214a48a7 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -4,156 +4,33 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging + import os -import platform -import shutil -import subprocess -import sys + import tempfile from datetime import datetime -from enum import auto, Enum from pathlib import Path -from typing import Any - -import pytest -import torch +from conftest import is_option_enabled from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder from executorch.exir.backend.compile_spec_schema import CompileSpec -class arm_test_options(Enum): - quantize_io = auto() - corstone300 = auto() - dump_path = auto() - date_format = auto() - fast_fvp = auto() - - -_test_options: dict[arm_test_options, Any] = {} - -# ==== Pytest hooks ==== - - -def pytest_addoption(parser): - parser.addoption("--arm_quantize_io", action="store_true") - parser.addoption("--arm_run_corstone300", action="store_true") - parser.addoption("--default_dump_path", default=None) - parser.addoption("--date_format", default="%d-%b-%H:%M:%S") - parser.addoption("--fast_fvp", action="store_true") - - -def pytest_configure(config): - if config.option.arm_quantize_io: - load_libquantized_ops_aot_lib() - _test_options[arm_test_options.quantize_io] = True - if config.option.arm_run_corstone300: - corstone300_exists = shutil.which("FVP_Corstone_SSE-300_Ethos-U55") - if not corstone300_exists: - raise RuntimeError( - "Tests are run with --arm_run_corstone300 but corstone300 FVP is not installed." 
- ) - _test_options[arm_test_options.corstone300] = True - if config.option.default_dump_path: - dump_path = Path(config.option.default_dump_path).expanduser() - if dump_path.exists() and os.path.isdir(dump_path): - _test_options[arm_test_options.dump_path] = dump_path - else: - raise RuntimeError( - f"Supplied argument 'default_dump_path={dump_path}' that does not exist or is not a directory." - ) - _test_options[arm_test_options.date_format] = config.option.date_format - _test_options[arm_test_options.fast_fvp] = config.option.fast_fvp - logging.basicConfig(level=logging.INFO, stream=sys.stdout) - - -def pytest_collection_modifyitems(config, items): - if not config.option.arm_quantize_io: - skip_if_aot_lib_not_loaded = pytest.mark.skip( - "u55 tests can only run with quantize_io=True." - ) - - for item in items: - if "u55" in item.name: - item.add_marker(skip_if_aot_lib_not_loaded) - - -def pytest_sessionstart(session): - pass - - -def pytest_sessionfinish(session, exitstatus): - if get_option(arm_test_options.dump_path): - _clean_dir( - get_option(arm_test_options.dump_path), - f"ArmTester_{get_option(arm_test_options.date_format)}.log", - ) - - -# ==== End of Pytest hooks ===== - -# ==== Custom Pytest decorators ===== - - -def expectedFailureOnFVP(test_item): - if is_option_enabled("corstone300"): - test_item.__unittest_expecting_failure__ = True - return test_item - - -# ==== End of Custom Pytest decorators ===== - - -def load_libquantized_ops_aot_lib(): - so_ext = { - "Darwin": "dylib", - "Linux": "so", - "Windows": "dll", - }.get(platform.system(), None) - - find_lib_cmd = [ - "find", - "cmake-out-aot-lib", - "-name", - f"libquantized_ops_aot_lib.{so_ext}", - ] - res = subprocess.run(find_lib_cmd, capture_output=True) - if res.returncode == 0: - library_path = res.stdout.decode().strip() - torch.ops.load_library(library_path) - - -def is_option_enabled( - option: str | arm_test_options, fail_if_not_enabled: bool = False -) -> bool: - """ - Returns whether an option is successfully enabled, i.e. if the flag was - given to pytest and the necessary requirements are available. - Implemented options are: - - corstone300. - - quantize_io. - - The optional parameter 'fail_if_not_enabled' makes the function raise - a RuntimeError instead of returning False. +def get_time_formatted_path(path: str, log_prefix: str) -> str: """ - if isinstance(option, str): - option = arm_test_options[option.lower()] - - if option in _test_options and _test_options[option]: - return True - else: - if fail_if_not_enabled: - raise RuntimeError(f"Required option '{option}' for test is not enabled") - else: - return False + Returns the log path with the current time appended to it. Used for debugging. + Args: + path: The path to the folder where the log file will be stored. + log_prefix: The name of the test. 
-def get_option(option: arm_test_options) -> Any | None: - if option in _test_options: - return _test_options[option] - return None + Example output: + './my_log_folder/test_BI_artifact_28-Nov-14:14:38.log' + """ + return str( + Path(path) / f"{log_prefix}_{datetime.now().strftime('%d-%b-%H:%M:%S')}.log" + ) def maybe_get_tosa_collate_path() -> str | None: @@ -303,35 +180,6 @@ def get_u85_compile_spec_unbuilt( return compile_spec -def current_time_formated() -> str: - """Return current time as a formated string""" - return datetime.now().strftime(get_option(arm_test_options.date_format)) - - -def _clean_dir(dir: Path, filter: str, num_save=10): - sorted_files: list[tuple[datetime, Path]] = [] - for file in dir.iterdir(): - try: - creation_time = datetime.strptime(file.name, filter) - insert_index = -1 - for i, to_compare in enumerate(sorted_files): - compare_time = to_compare[0] - if creation_time < compare_time: - insert_index = i - break - if insert_index == -1 and len(sorted_files) < num_save: - sorted_files.append((creation_time, file)) - else: - sorted_files.insert(insert_index, (creation_time, file)) - except ValueError: - continue - - if len(sorted_files) > num_save: - for remove in sorted_files[0 : len(sorted_files) - num_save]: - file = remove[1] - file.unlink() - - def get_target_board(compile_spec: list[CompileSpec]) -> str | None: for spec in compile_spec: if spec.key == "compile_flags": diff --git a/backends/arm/test/conftest.py b/backends/arm/test/conftest.py new file mode 100644 index 0000000000..a94adb9a89 --- /dev/null +++ b/backends/arm/test/conftest.py @@ -0,0 +1,196 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +import platform +import random +import re +import shutil +import subprocess +import sys +from enum import auto, Enum +from typing import Any + +import pytest +import torch + +""" +This file contains the pytest hooks, fixtures etc. for the Arm test suite. +""" + + +class arm_test_options(Enum): + quantize_io = auto() + corstone_fvp = auto() + fast_fvp = auto() + + +_test_options: dict[arm_test_options, Any] = {} + +# ==== Pytest hooks ==== + + +def pytest_configure(config): + if config.option.arm_quantize_io: + _load_libquantized_ops_aot_lib() + _test_options[arm_test_options.quantize_io] = True + if config.option.arm_run_corstoneFVP: + corstone300_exists = shutil.which("FVP_Corstone_SSE-300_Ethos-U55") + corstone320_exists = shutil.which("FVP_Corstone_SSE-320") + if not (corstone300_exists and corstone320_exists): + raise RuntimeError( + "Tests are run with --arm_run_corstoneFVP but corstone FVP is not installed." + ) + _test_options[arm_test_options.corstone_fvp] = True + _test_options[arm_test_options.fast_fvp] = config.option.fast_fvp + logging.basicConfig(level=logging.INFO, stream=sys.stdout) + + +def pytest_collection_modifyitems(config, items): + """ + Skip all tests that require run on Ethos-U if the option arm_quantize_io is + not set. + """ + if not config.option.arm_quantize_io: + skip_if_aot_lib_not_loaded = pytest.mark.skip( + "Ethos-U tests can only run on FVP with quantize_io=True." 
+        )
+
+        for item in items:
+            if re.search(r"u55|u65|u85", item.name, re.IGNORECASE):
+                item.add_marker(skip_if_aot_lib_not_loaded)
+
+
+def pytest_addoption(parser):
+    parser.addoption("--arm_quantize_io", action="store_true")
+    parser.addoption("--arm_run_corstoneFVP", action="store_true")
+    parser.addoption("--fast_fvp", action="store_true")
+
+
+def pytest_sessionstart(session):
+    pass
+
+
+def pytest_sessionfinish(session, exitstatus):
+    pass
+
+
+# ==== End of Pytest hooks =====
+
+
+# ==== Pytest fixtures =====
+
+
+@pytest.fixture(autouse=True)
+def set_random_seed():
+    """
+    Control random numbers in the Arm test suite. The default behavior is a
+    random seed, which is set before each test. Use the env variable
+    ARM_TEST_SEED to override the default behavior, or set it to RANDOM if
+    you want to be explicit.
+
+    Examples:
+    By default, a random seed is used for each test
+        ARM_TEST_SEED=RANDOM pytest --config-file=/dev/null --verbose -s --color=yes  backends/arm/test/ops/test_avg_pool.py -k <TESTCASE>
+    Rerun with a specific seed reported by an earlier random-seed run
+        ARM_TEST_SEED=3478246 pytest --config-file=/dev/null --verbose -s --color=yes  backends/arm/test/ops/test_avg_pool.py -k <TESTCASE>
+    """
+    if os.environ.get("ARM_TEST_SEED", "RANDOM") == "RANDOM":
+        random.seed()  # reset seed, in case any other test has fiddled with it
+        seed = random.randint(0, 2**32 - 1)
+        random.seed(seed)
+        torch.manual_seed(seed)
+    else:
+        seed_str = os.environ.get("ARM_TEST_SEED", "0")
+        if seed_str.isdigit():
+            seed = int(seed_str)
+            random.seed(seed)
+            torch.manual_seed(seed)
+        else:
+            raise TypeError(
+                "ARM_TEST_SEED env variable must be an integer or the string RANDOM"
+            )
+
+    print(f" ARM_TEST_SEED={seed} ", end=" ")
+
+
+# ==== End of Pytest fixtures =====
+
+
+# ==== Custom Pytest decorators =====
+
+
+def expectedFailureOnFVP(test_item):
+    if is_option_enabled("corstone_fvp"):
+        test_item.__unittest_expecting_failure__ = True
+    return test_item
+
+
+# ==== End of Custom Pytest decorators =====
+
+
+def is_option_enabled(
+    option: str | arm_test_options, fail_if_not_enabled: bool = False
+) -> bool:
+    """
+    Returns whether an option is successfully enabled, i.e. if the flag was
+    given to pytest and the necessary requirements are available.
+    Implemented options are:
+    - corstone_fvp.
+    - quantize_io.
+
+    The optional parameter 'fail_if_not_enabled' makes the function raise
+    a RuntimeError instead of returning False.
+    """
+    if isinstance(option, str):
+        option = arm_test_options[option.lower()]
+
+    if option in _test_options and _test_options[option]:
+        return True
+    else:
+        if fail_if_not_enabled:
+            raise RuntimeError(f"Required option '{option}' for test is not enabled")
+        else:
+            return False
+
+
+def get_option(option: arm_test_options) -> Any | None:
+    """
+    Returns the value of a pytest option if it is set, otherwise None.
+
+    Args:
+        option (arm_test_options): The option to check for.
+    """
+    if option in _test_options:
+        return _test_options[option]
+    return None
+
+
+def _load_libquantized_ops_aot_lib():
+    """
+    Load the libquantized_ops_aot_lib shared library. It's required when
+    arm_quantize_io is set.
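+
+    The library is searched for under cmake-out-aot-lib using the
+    platform-specific extension, and loaded into torch.ops so that the
+    quantized custom ops resolve at test time.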
+ """ + so_ext = { + "Darwin": "dylib", + "Linux": "so", + "Windows": "dll", + }.get(platform.system(), None) + + find_lib_cmd = [ + "find", + "cmake-out-aot-lib", + "-name", + f"libquantized_ops_aot_lib.{so_ext}", + ] + + res = subprocess.run(find_lib_cmd, capture_output=True) + if res.returncode == 0: + library_path = res.stdout.decode().strip() + torch.ops.load_library(library_path) + else: + raise RuntimeError( + f"Failed to load libquantized_ops_aot_lib.{so_ext}. Did you build it?" + ) diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index 4cac39af70..3343ae748c 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -80,7 +80,9 @@ def _is_tosa_marker_in_file(self, tmp_file): def test_MI_artifact(self): model = Linear(20, 30) - tmp_file = os.path.join(tempfile.mkdtemp(), "tosa_dump_MI.txt") + tmp_file = common.get_time_formatted_path( + tempfile.mkdtemp(), self._testMethodName + ) self._tosa_MI_pipeline(model, dump_file=tmp_file) assert os.path.exists(tmp_file), f"File {tmp_file} was not created" if self._is_tosa_marker_in_file(tmp_file): @@ -89,7 +91,9 @@ def test_MI_artifact(self): def test_BI_artifact(self): model = Linear(20, 30) - tmp_file = os.path.join(tempfile.mkdtemp(), "tosa_dump_BI.txt") + tmp_file = common.get_time_formatted_path( + tempfile.mkdtemp(), self._testMethodName + ) self._tosa_BI_pipeline(model, dump_file=tmp_file) assert os.path.exists(tmp_file), f"File {tmp_file} was not created" if self._is_tosa_marker_in_file(tmp_file): diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index 19b4254575..24af9cf41a 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -9,7 +9,7 @@ import unittest import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir import EdgeCompileConfig @@ -96,7 +96,7 @@ def test_mv2_u55_BI(self): .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs( atol=1.0, qtol=1, inputs=self.model_inputs, target_board="corstone-300" ) @@ -114,7 +114,7 @@ def test_mv2_u85_BI(self): .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs( atol=1.0, qtol=1, inputs=self.model_inputs, target_board="corstone-320" ) diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 6676a38add..f40037f62f 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -10,7 +10,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir import EdgeCompileConfig from executorch.exir.backend.compile_spec_schema import CompileSpec @@ -115,7 +115,7 @@ def _test_add_ethos_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) return tester diff --git a/backends/arm/test/ops/test_avg_pool.py 
b/backends/arm/test/ops/test_avg_pool.py index ad3ddf8c0a..4801849949 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -14,7 +14,7 @@ ArmQuantizer, get_symmetric_quantization_config, ) -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize from executorch.exir.backend.backend_details import CompileSpec @@ -118,7 +118,7 @@ def _test_avgpool2d_tosa_ethos_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 523a90cdc8..0952d2595f 100644 --- a/backends/arm/test/ops/test_bmm.py +++ b/backends/arm/test/ops/test_bmm.py @@ -9,7 +9,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -112,7 +112,7 @@ def _test_bmm_ethosu_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) @parameterized.expand(BMM.test_parameters) @@ -161,7 +161,7 @@ def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): ) @parameterized.expand(BMM.test_parameters[1:]) - @common.expectedFailureOnFVP + @conftest.expectedFailureOnFVP def test_bmm_u85_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) self._test_bmm_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py index 88846369d0..bf436a8c18 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -10,8 +10,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common - +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -114,7 +113,7 @@ def _test_cat_ethosu_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(inputs=test_data) @parameterized.expand(Cat.test_parameters) @@ -135,7 +134,7 @@ def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int): # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Cat.test_parameters) - @common.expectedFailureOnFVP + @conftest.expectedFailureOnFVP def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): test_data = (operands, dim) self._test_cat_ethosu_BI_pipeline( @@ -144,7 +143,7 @@ def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Cat.test_parameters) - @common.expectedFailureOnFVP + @conftest.expectedFailureOnFVP def 
test_cat_u85_BI(self, operands: tuple[torch.Tensor, ...], dim: int): test_data = (operands, dim) self._test_cat_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 6b5216a8e1..2e7726a0bc 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -17,7 +17,7 @@ ArmQuantizer, get_symmetric_quantization_config, ) -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize @@ -96,7 +96,7 @@ def _test_clone_tosa_ethos_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) def _test_clone_tosa_u55_pipeline( diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py index f00c7984a1..e6e027ed6e 100644 --- a/backends/arm/test/ops/test_conv1d.py +++ b/backends/arm/test/ops/test_conv1d.py @@ -9,8 +9,7 @@ from typing import List, Optional, Tuple, Union import torch -from executorch.backends.arm.test import common - +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -279,7 +278,7 @@ def _test_conv1d_ethosu_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite) diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 21df4bf0d5..222945cd16 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -9,8 +9,7 @@ from typing import List, Optional, Tuple, Union import torch -from executorch.backends.arm.test import common - +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -310,7 +309,7 @@ def _test_conv2d_ethosu_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite) diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 001c4a2bd5..86bf9cb632 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -12,7 +12,7 @@ import pytest import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -253,7 +253,7 @@ def _test_conv_combo_ethos_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) #################### diff --git a/backends/arm/test/ops/test_depthwise_conv.py 
b/backends/arm/test/ops/test_depthwise_conv.py index d753245f43..083e9aaf68 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -9,7 +9,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.ops.test_conv1d import Conv1d from executorch.backends.arm.test.ops.test_conv2d import Conv2d @@ -243,7 +243,7 @@ def _test_dw_conv_ethos_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite_conv1d + testsuite_conv2d) @@ -301,7 +301,7 @@ def test_dw_conv_u85_BI( # All test cases except 3x3_1x3x256x256_gp3_st1 have numerical issues on FVP. MLETORCH-520 @parameterized.expand(testsuite_conv2d_u85_xfails) - @common.expectedFailureOnFVP + @conftest.expectedFailureOnFVP def test_dw_conv_u85_BI_xfails( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index 27febd714e..eaf6a21023 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -11,7 +11,7 @@ from typing import Optional, Tuple, Union import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized @@ -157,7 +157,7 @@ def _test_div_ethos_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) @@ -198,7 +198,7 @@ def test_div_u55_BI( # Numerical issues on FVP likely due to mul op, MLETORCH-521 @parameterized.expand(test_data_suite[2:]) - @common.expectedFailureOnFVP + @conftest.expectedFailureOnFVP def test_div_u55_BI_xfails( self, test_name: str, @@ -226,7 +226,7 @@ def test_div_u85_BI( # Numerical issues on FVP likely due to mul op, MLETORCH-521 @parameterized.expand(test_data_suite[2:]) - @common.expectedFailureOnFVP + @conftest.expectedFailureOnFVP def test_div_u85_BI_xfails( self, test_name: str, diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py index f33e0a9058..57cd23bb14 100644 --- a/backends/arm/test/ops/test_exp.py +++ b/backends/arm/test/ops/test_exp.py @@ -10,7 +10,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -95,7 +95,7 @@ def _test_exp_ethosu_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py index 27f311b546..05f72aa379 100644 --- a/backends/arm/test/ops/test_expand.py +++ b/backends/arm/test/ops/test_expand.py @@ -17,7 +17,7 @@ ArmQuantizer, get_symmetric_quantization_config, ) -from 
executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.backends.xnnpack.test.tester.tester import Quantize
@@ -97,7 +97,7 @@ def _test_expand_ethosu_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(Expand.test_parameters)
@@ -110,7 +110,7 @@ def test_expand_tosa_BI(self, test_input, multiples):
 
     # Mismatch in provided number of inputs and model signature, MLETORCH 519
     @parameterized.expand(Expand.test_parameters)
-    @common.expectedFailureOnFVP
+    @conftest.expectedFailureOnFVP
     def test_expand_u55_BI(self, test_input, multiples):
         self._test_expand_ethosu_BI_pipeline(
             common.get_u55_compile_spec(), self.Expand(), (test_input, multiples)
@@ -118,7 +118,7 @@ def test_expand_u55_BI(self, test_input, multiples):
 
     # Mismatch in provided number of inputs and model signature, MLETORCH 519
     @parameterized.expand(Expand.test_parameters)
-    @common.expectedFailureOnFVP
+    @conftest.expectedFailureOnFVP
     def test_expand_u85_BI(self, test_input, multiples):
         self._test_expand_ethosu_BI_pipeline(
             common.get_u85_compile_spec(), self.Expand(), (test_input, multiples)
diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py
index 9857a7b87b..2ee41f8bc1 100644
--- a/backends/arm/test/ops/test_full.py
+++ b/backends/arm/test/ops/test_full.py
@@ -13,7 +13,7 @@ from typing import Tuple
 
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from parameterized import parameterized
@@ -109,7 +109,7 @@ def _test_full_tosa_ethos_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple):
@@ -145,7 +145,7 @@ def test_full_tosa_BI(self, test_tensor: Tuple):
 
     # Mismatch in provided number of inputs and model signature, MLETORCH 519
     @parameterized.expand(AddVariableFull.test_parameters)
-    @common.expectedFailureOnFVP
+    @conftest.expectedFailureOnFVP
     def test_full_u55_BI(self, test_tensor: Tuple):
         self._test_full_tosa_u55_pipeline(
             self.AddVariableFull(),
@@ -154,7 +154,7 @@ def test_full_u55_BI(self, test_tensor: Tuple):
 
     # Mismatch in provided number of inputs and model signature, MLETORCH 519
     @parameterized.expand(AddVariableFull.test_parameters)
-    @common.expectedFailureOnFVP
+    @conftest.expectedFailureOnFVP
     def test_full_u85_BI(self, test_tensor: Tuple):
         self._test_full_tosa_u85_pipeline(
             self.AddVariableFull(),
diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py
index 10073c5095..1c763e8167 100644
--- a/backends/arm/test/ops/test_hardtanh.py
+++ b/backends/arm/test/ops/test_hardtanh.py
@@ -15,7 +15,7 @@
     get_symmetric_quantization_config,
 )
 
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from parameterized import parameterized
@@ -108,7 +108,7 @@ def _test_hardtanh_tosa_ethosu_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_suite)
diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py
index e84dd4ee58..a4d3bc5adf 100644
--- a/backends/arm/test/ops/test_layer_norm.py
+++ b/backends/arm/test/ops/test_layer_norm.py
@@ -8,7 +8,7 @@ from typing import List, Tuple, Union
 
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.exir.backend.backend_details import CompileSpec
 from parameterized import parameterized
@@ -130,7 +130,7 @@ def _test_layernorm_ethosu_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_suite)
diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py
index 30d4b2890a..8aabd365af 100644
--- a/backends/arm/test/ops/test_linear.py
+++ b/backends/arm/test/ops/test_linear.py
@@ -11,7 +11,7 @@ from typing import Tuple
 
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 
 from executorch.exir import EdgeCompileConfig
@@ -247,7 +247,7 @@ def test_linear_tosa_u55_BI(
             test_data,
         )
 
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4)
diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py
index 10175d27fb..4dd1fc97c7 100644
--- a/backends/arm/test/ops/test_log.py
+++ b/backends/arm/test/ops/test_log.py
@@ -10,7 +10,7 @@ from typing import Tuple
 
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.exir.backend.backend_details import CompileSpec
 from parameterized import parameterized
@@ -95,7 +95,7 @@ def _test_log_ethosu_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_suite)
diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py
index 41526b1c77..3a12616df6 100644
--- a/backends/arm/test/ops/test_max_pool.py
+++ b/backends/arm/test/ops/test_max_pool.py
@@ -15,7 +15,7 @@
     ArmQuantizer,
     get_symmetric_quantization_config,
 )
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 
@@ -171,7 +171,7 @@ def test_maxpool2d_tosa_u55_BI(
             common.get_u55_compile_spec(permute_memory_to_nhwc=True),
             (test_data,),
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(
                 qtol=1, inputs=(test_data,), target_board="corstone-300"
             )
@@ -188,7 +188,7 @@ def test_maxpool2d_tosa_u85_BI(
             common.get_u85_compile_spec(permute_memory_to_nhwc=True),
             (test_data,),
        )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(
                 qtol=1, inputs=(test_data,), target_board="corstone-320"
             )
@@ -216,7 +216,7 @@ def test_maxpool2d_tosa_BI_mult_batches(
         )
 
     @parameterized.expand(test_data_suite_mult_batches)
-    @common.expectedFailureOnFVP  # TODO: MLETORCH-433
+    @conftest.expectedFailureOnFVP  # TODO: MLETORCH-433
     def test_maxpool2d_tosa_u55_BI_mult_batches(
         self,
         test_name: str,
@@ -228,13 +228,13 @@ def test_maxpool2d_tosa_u55_BI_mult_batches(
             common.get_u55_compile_spec(permute_memory_to_nhwc=True),
             (test_data,),
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(
                 qtol=1, inputs=(test_data,), target_board="corstone-300"
             )
 
     @parameterized.expand(test_data_suite_mult_batches)
-    @common.expectedFailureOnFVP  # TODO: MLETORCH-433
+    @conftest.expectedFailureOnFVP  # TODO: MLETORCH-433
     def test_maxpool2d_tosa_u85_BI_mult_batches(
         self,
         test_name: str,
@@ -246,7 +246,7 @@ def test_maxpool2d_tosa_u85_BI_mult_batches(
             common.get_u85_compile_spec(permute_memory_to_nhwc=True),
             (test_data,),
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(
                 qtol=1, inputs=(test_data,), target_board="corstone-320"
             )
diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py
index 6d6922628e..ced71b0072 100644
--- a/backends/arm/test/ops/test_mul.py
+++ b/backends/arm/test/ops/test_mul.py
@@ -8,7 +8,7 @@
 import unittest
 
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.exir.backend.backend_details import CompileSpec
 from parameterized import parameterized
@@ -128,7 +128,7 @@ def _test_mul_ethosu_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_sute)
diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py
index 92400215b7..581cd3cfbc 100644
--- a/backends/arm/test/ops/test_permute.py
+++ b/backends/arm/test/ops/test_permute.py
@@ -15,7 +15,7 @@
     get_symmetric_quantization_config,
 )
 
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from executorch.exir.backend.compile_spec_schema import CompileSpec
@@ -117,7 +117,7 @@ def _test_permute_ethos_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_suite)
@@ -155,7 +155,7 @@ def test_permute_u85_BI(
 
     # Fails since on FVP since N > 1 is not supported. MLETORCH-517
     @parameterized.expand(test_data_suite[-2:])
-    @common.expectedFailureOnFVP
+    @conftest.expectedFailureOnFVP
     def test_permute_u85_BI_xfails(
         self, test_name: str, test_data: torch.Tensor, dims: list[int]
     ):
diff --git a/backends/arm/test/ops/test_reciprocal.py b/backends/arm/test/ops/test_reciprocal.py
index 876f063c76..a71396caf3 100644
--- a/backends/arm/test/ops/test_reciprocal.py
+++ b/backends/arm/test/ops/test_reciprocal.py
@@ -7,7 +7,7 @@
 import unittest
 
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from parameterized import parameterized
@@ -97,7 +97,7 @@ def _test_reciprocal_u55_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_suite)
diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py
index 327a8de994..0592141028 100644
--- a/backends/arm/test/ops/test_sub.py
+++ b/backends/arm/test/ops/test_sub.py
@@ -10,8 +10,7 @@ from typing import Tuple
 
 import torch
-from executorch.backends.arm.test import common
-
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from parameterized import parameterized
@@ -98,7 +97,7 @@ def _test_sub_ethosu_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(Sub.test_parameters)
diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py
index b61c1b465f..a8a113cf93 100644
--- a/backends/arm/test/runner_utils.py
+++ b/backends/arm/test/runner_utils.py
@@ -17,7 +17,7 @@
 import numpy as np
 import torch
 
-from executorch.backends.arm.test.common import arm_test_options, is_option_enabled
+from executorch.backends.arm.test.conftest import arm_test_options, is_option_enabled
 
 from torch.export import ExportedProgram
 from torch.fx.node import Node
@@ -218,7 +218,7 @@ def run_corstone(
         assert (
             self._has_init_run
-        ), "RunnerUtil needs to be initialized using init_run() before running Corstone300."
+        ), "RunnerUtil needs to be initialized using init_run() before running Corstone FVP."
 
         if self.target_board not in ["corstone-300", "corstone-320"]:
             raise RuntimeError(f"Unknown target board: {self.target_board}")
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index 3564a3325a..6784605bb4 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -22,12 +22,7 @@
     ArmQuantizer,
     get_symmetric_quantization_config,
 )
-from executorch.backends.arm.test.common import (
-    arm_test_options,
-    current_time_formated,
-    get_option,
-    get_target_board,
-)
+from executorch.backends.arm.test.common import get_target_board
 
 from executorch.backends.arm.test.runner_utils import (
     _get_input_quantization_params,
@@ -626,9 +621,6 @@ def _get_tosa_operator_distribution(
 
 
 def _dump_str(to_print: str, path_to_dump: Optional[str] = None):
-    default_dump_path = get_option(arm_test_options.dump_path)
-    if not path_to_dump and default_dump_path:
-        path_to_dump = default_dump_path / f"ArmTester_{current_time_formated()}.log"
     if path_to_dump:
         with open(path_to_dump, "a") as fp:
             fp.write(to_print)
diff --git a/examples/arm/README.md b/examples/arm/README.md
index 717a96c13e..bb68ef537b 100644
--- a/examples/arm/README.md
+++ b/examples/arm/README.md
@@ -24,7 +24,7 @@ To run these scripts. On a Linux system, in a terminal, with a working internet
 $ ./setup.sh --i-agree-to-the-contained-eula [optional-scratch-dir]
 
 # Step [2] - build + run ExecuTorch and executor_runner baremetal application
-# suited for Corstone300 to run a simple PyTorch model.
+# suited for Corstone FVPs to run a simple PyTorch model.
 $ ./run.sh [--scratch-dir=same-optional-scratch-dir-as-before]
 ```
 ### Online Tutorial

From 2d499b3d0cf0b085e373404e5ff421a73a4a22b4 Mon Sep 17 00:00:00 2001
From: Tarun Karuturi <58826100+tarun292@users.noreply.github.com>
Date: Mon, 2 Dec 2024 01:55:31 -0800
Subject: [PATCH 27/27] Add quantize_per_channel and dequantize_per_channel to
 q_dq_ops target

Differential Revision: D66400800

Pull Request resolved: https://github.com/pytorch/executorch/pull/7045
---
 kernels/quantized/targets.bzl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernels/quantized/targets.bzl b/kernels/quantized/targets.bzl
index 13ef166ece..5440400612 100644
--- a/kernels/quantized/targets.bzl
+++ b/kernels/quantized/targets.bzl
@@ -69,6 +69,8 @@ def define_common_targets():
             "quantized_decomposed::dequantize_per_tensor.Tensor_out",
             "quantized_decomposed::quantize_per_tensor.out",
             "quantized_decomposed::quantize_per_tensor.Tensor_out",
+            "quantized_decomposed::dequantize_per_channel.out",
+            "quantized_decomposed::quantize_per_channel.out",
         ],
     )
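
For readers following the Corstone rename above: `conftest.is_option_enabled` gates FVP execution behind an opt-in test option, so the Corstone tests are skipped unless the runner explicitly enables them. A minimal sketch of that gating pattern, assuming a typical pytest `conftest.py`; the option dictionary, flag name, and fallback behavior here are illustrative and not the exact ExecuTorch implementation:

```python
# Sketch of the option plumbing the tests above rely on (assumed layout).
from typing import Dict

# In a real conftest.py this would be populated from pytest command-line
# flags (e.g. a hypothetical --arm_run_corstone_fvp switch).
_enabled_options: Dict[str, bool] = {"corstone_fvp": False}


def is_option_enabled(option: str, fail_if_not_enabled: bool = False) -> bool:
    """Return True if a named test option such as 'corstone_fvp' is enabled."""
    if _enabled_options.get(option, False):
        return True
    if fail_if_not_enabled:
        raise RuntimeError(f"Test option '{option}' is required but not enabled.")
    return False


# Usage mirroring the tests above: only run on the FVP when opted in.
if is_option_enabled("corstone_fvp"):
    print("would run the serialized model on the Corstone FVP and compare outputs")
```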
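The two ops registered in [PATCH 27/27] are the per-channel counterparts of the per-tensor quantize/dequantize entries already in the list: per-channel quantization keeps one (scale, zero_point) pair per slice along a chosen axis rather than a single pair for the whole tensor. A rough reference of what the dequantize side computes, in plain PyTorch; `dequantize_per_channel_ref` is an illustrative helper, not the registered out-variant kernel:

```python
import torch


def dequantize_per_channel_ref(
    qx: torch.Tensor,
    scales: torch.Tensor,
    zero_points: torch.Tensor,
    axis: int,
) -> torch.Tensor:
    # Reshape scales/zero_points so they broadcast along the chosen axis;
    # this per-axis pairing is what distinguishes per-channel from
    # per-tensor quantization.
    shape = [1] * qx.dim()
    shape[axis] = qx.size(axis)
    scales = scales.reshape(shape).to(torch.float32)
    zero_points = zero_points.reshape(shape).to(torch.float32)
    return (qx.to(torch.float32) - zero_points) * scales


# Example: int8 weights with one (scale, zero_point) pair per output channel.
qw = torch.randint(-128, 128, (4, 8), dtype=torch.int8)
w = dequantize_per_channel_ref(
    qw,
    scales=torch.rand(4) * 0.1,
    zero_points=torch.zeros(4, dtype=torch.int32),
    axis=0,
)
print(w.shape)  # torch.Size([4, 8])
```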