From 1c7d94e3aa6fee30b3a4f618da9cd90129bc1633 Mon Sep 17 00:00:00 2001
From: Hansong <107070759+kirklandsign@users.noreply.github.com>
Date: Tue, 26 Nov 2024 15:24:56 -0800
Subject: [PATCH 01/27] Rollout ghstack_land bot to everyone

Now that it's in good shape, let's expand it to everyone.

Pull Request resolved: https://github.com/pytorch/executorch/pull/7092
Original discussion: https://github.com/pytorch/executorch/pull/6270#discussion_r1805490087
---
 .github/workflows/ghstack_land.yml | 16 +---------------
 1 file changed, 1 insertion(+), 15 deletions(-)

diff --git a/.github/workflows/ghstack_land.yml b/.github/workflows/ghstack_land.yml
index e3b02d2a94..09bd2a7ced 100644
--- a/.github/workflows/ghstack_land.yml
+++ b/.github/workflows/ghstack_land.yml
@@ -3,21 +3,7 @@ on:
   pull_request:
     types: [closed]
     branches:
-      - 'gh/cccclai/[0-9]+/base'
-      - 'gh/dbort/[0-9]+/base'
-      - 'gh/dvorjackz/[0-9]+/base'
-      - 'gh/guangy10/[0-9]+/base'
-      - 'gh/helunwencser/[0-9]+/base'
-      - 'gh/jorgep31415/[0-9]+/base'
-      - 'gh/kimishpatel/[0-9]+/base'
-      - 'gh/kirklandsign/[0-9]+/base'
-      - 'gh/larryliu0820/[0-9]+/base'
-      - 'gh/lucylq/[0-9]+/base'
-      - 'gh/manuelcandales/[0-9]+/base'
-      - 'gh/mcr229/[0-9]+/base'
-      - 'gh/swolchok/[0-9]+/base'
-      - 'gh/SS-JIA/[0-9]+/base'
-      - 'gh/trivedivivek/[0-9]+/base'
+      - 'gh/*/[0-9]+/base'

 jobs:
   ghstack_merge_to_main:

From 9b29b4b8ee2a52972480dea05956a3350a78ef1d Mon Sep 17 00:00:00 2001
From: George Hong
Date: Tue, 26 Nov 2024 16:20:16 -0800
Subject: [PATCH 02/27] Update training module to have super class methods
 accessible (#7082)

Update training module to have super class methods accessible (#7082)

Summary: This is needed so the training module has access to non-training
methods (e.g. constant string return methods).

Reviewed By: JacobSzwejbka

Differential Revision: D66419247
---
 extension/training/module/training_module.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/extension/training/module/training_module.h b/extension/training/module/training_module.h
index b31463a68f..9e7aa49cac 100644
--- a/extension/training/module/training_module.h
+++ b/extension/training/module/training_module.h
@@ -26,7 +26,8 @@ namespace training {
  * A facade class for loading programs for on-device training and executing
  * methods within them.
  */
-class ET_EXPERIMENTAL TrainingModule final : executorch::extension::Module {
+class ET_EXPERIMENTAL TrainingModule final
+    : public executorch::extension::Module {
  public:
   explicit TrainingModule(
       std::unique_ptr data_loader,

From dedf77bd3082756c6ff13a16e1265f3f481bc1ed Mon Sep 17 00:00:00 2001
From: Mengwei Liu
Date: Tue, 26 Nov 2024 16:43:09 -0800
Subject: [PATCH 03/27] Fix shared library rpath once and for all (#7096)
---
 CMakeLists.txt                          | 56 +++++++------------
 extension/llm/custom_ops/CMakeLists.txt | 21 ----------
 2 files changed, 16 insertions(+), 61 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b3b80b4e41..f960dced37 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -682,6 +682,22 @@ if(EXECUTORCH_BUILD_PTHREADPOOL
 endif()

 if(EXECUTORCH_BUILD_PYBIND)
+  # Setup RPATH.
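+  # $ORIGIN (Linux) / @loader_path (macOS) let the installed extension
+  # resolve its shared-library dependencies relative to its own location
+  # inside the wheel instead of via absolute build-tree paths.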
+ # See https://gitlab.kitware.com/cmake/community/-/wikis/doc/cmake/RPATH-handling + if(APPLE) + set(CMAKE_MACOSX_RPATH ON) + set(_rpath_portable_origin "@loader_path") + else() + set(_rpath_portable_origin $ORIGIN) + endif(APPLE) + # Use separate rpaths during build and install phases + set(CMAKE_SKIP_BUILD_RPATH FALSE) + # Don't use the install-rpath during the build phase + set(CMAKE_BUILD_WITH_INSTALL_RPATH FALSE) + set(CMAKE_INSTALL_RPATH "${_rpath_portable_origin}") + # Automatically add all linked folders that are NOT in the build directory to + # the rpath (per library?) + set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/third-party/pybind11) if(NOT EXECUTORCH_BUILD_EXTENSION_DATA_LOADER) @@ -765,46 +781,6 @@ if(EXECUTORCH_BUILD_PYBIND) target_include_directories(portable_lib PRIVATE ${TORCH_INCLUDE_DIRS}) target_compile_options(portable_lib PUBLIC ${_pybind_compile_options}) target_link_libraries(portable_lib PRIVATE ${_dep_libs}) - if(APPLE) - # pip wheels will need to be able to find the torch libraries. On Linux, the - # .so has non-absolute dependencies on libs like "libtorch.so" without - # paths; as long as we `import torch` first, those dependencies will work. - # But Apple dylibs do not support non-absolute dependencies, so we need to - # tell the loader where to look for its libraries. The LC_LOAD_DYLIB entries - # for the torch libraries will look like "@rpath/libtorch.dylib", so we can - # add an LC_RPATH entry to look in a directory relative to the installed - # location of our _portable_lib.so file. To see these LC_* values, run - # `otool -l _portable_lib*.so`. - set_target_properties( - portable_lib - PROPERTIES # Assume that this library will be installed in - # `site-packages/executorch/extension/pybindings`, and that - # the torch libs are in `site-packages/torch/lib`. - BUILD_RPATH "@loader_path/../../../torch/lib" - INSTALL_RPATH "@loader_path/../../../torch/lib" - # Assume is the root `site-packages/executorch` - # Need to add /extension/llm/custom_ops for - # libcustom_ops_aot_lib.dylib - BUILD_RPATH "@loader_path/../../extension/llm/custom_ops" - INSTALL_RPATH "@loader_path/../../extension/llm/custom_ops" - # Need to add /kernels/quantized for - # libquantized_ops_aot_lib.dylib - BUILD_RPATH "@loader_path/../../kernels/quantized" - INSTALL_RPATH "@loader_path/../../kernels/quantized" - ) - else() - set_target_properties( - portable_lib - PROPERTIES - # Assume is the root `site-packages/executorch` - # Need to add /extension/llm/custom_ops for - # libcustom_ops_aot_lib - # Need to add /kernels/quantized for - # libquantized_ops_aot_lib - BUILD_RPATH - "$ORIGIN:$ORIGIN/../../extension/llm/custom_ops:$ORIGIN/../../kernels/quantized" - ) - endif() install(TARGETS portable_lib LIBRARY DESTINATION executorch/extension/pybindings diff --git a/extension/llm/custom_ops/CMakeLists.txt b/extension/llm/custom_ops/CMakeLists.txt index 36b03a480f..811eb87ac6 100644 --- a/extension/llm/custom_ops/CMakeLists.txt +++ b/extension/llm/custom_ops/CMakeLists.txt @@ -109,26 +109,5 @@ if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT) ${_common_compile_options} -DET_USE_THREADPOOL ) - # pip wheels will need to be able to find the dependent libraries. On Linux, - # the .so has non-absolute dependencies on libs like "_portable_lib.so" - # without paths; as long as we `import torch` first, those dependencies will - # work. But Apple dylibs do not support non-absolute dependencies, so we need - # to tell the loader where to look for its libraries. 
The LC_LOAD_DYLIB
-  # entries for the portable_lib libraries will look like
-  # "@rpath/_portable_lib.cpython-310-darwin.so", so we can add an LC_RPATH
-  # entry to look in a directory relative to the installed location of our
-  # _portable_lib.so file. To see these LC_* values, run `otool -l
-  # libcustom_ops_aot_lib.dylib`.
-  if(APPLE)
-    set_target_properties(
-      custom_ops_aot_lib
-      PROPERTIES # Assume this library will be installed in
-                 # <site-packages>/executorch/extension/llm/custom_ops/, and the
-                 # _portable_lib.so is installed in
-                 # <site-packages>/executorch/extension/pybindings/
-                 BUILD_RPATH "@loader_path/../../pybindings"
-                 INSTALL_RPATH "@loader_path/../../pybindings"
-    )
-  endif()
   install(TARGETS custom_ops_aot_lib DESTINATION lib)
 endif()

From 5785fc3e80bddf1af04fda270b869881363e3308 Mon Sep 17 00:00:00 2001
From: JP <46308822+zonglinpeng@users.noreply.github.com>
Date: Tue, 26 Nov 2024 17:46:17 -0800
Subject: [PATCH 04/27] add unit test for op_add (#7087)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

add op_add shapes to generate as binaries (#7087)

Summary: Generates the add model PTEs for Cadence to execute on. Will use the
graph builder in later diffs.

Test Plan: Imported from GitHub, without a `Test Plan:` line.

{F1968254537}

Reviewed By: hsharma35

Differential Revision: D66510372

Pulled By: zonglinpeng
---
 backends/cadence/aot/TARGETS              |  20 ++++
 backends/cadence/aot/export_example.py    |  14 +--
 backends/cadence/aot/utils.py             |   3 +-
 backends/cadence/runtime/TARGETS          |   2 +
 examples/cadence/operators/TARGETS        |  26 +++++
 examples/cadence/operators/test_add_op.py | 115 ++++++++++++++++++++++
 6 files changed, 173 insertions(+), 7 deletions(-)
 create mode 100644 examples/cadence/operators/TARGETS
 create mode 100644 examples/cadence/operators/test_add_op.py

diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS
index 24b0266911..661f8cf0d4 100644
--- a/backends/cadence/aot/TARGETS
+++ b/backends/cadence/aot/TARGETS
@@ -50,6 +50,26 @@ python_library(
     ],
 )

+python_library(
+    name = "export_example",
+    srcs = [
+        "export_example.py",
+    ],
+    deps = [
+        ":passes",
+        ":utils",
+        ":ops_registrations",
+        ":replace_ops",
+        "//caffe2:torch",
+        "//executorch/backends/cadence/aot/quantizer:fusion_pass",
+        "//executorch/backends/cadence/runtime:runtime",
+        "//executorch/backends/cadence/aot/quantizer:quantizer",
+        "//executorch/backends/transforms:decompose_sdpa",
+        "//executorch/backends/transforms:remove_clone_ops",
+        "//executorch/exir:lib",
+        "//executorch/devtools:lib",
+    ],
+)
+
 python_library(
     name = "pass_utils",

diff --git a/backends/cadence/aot/export_example.py b/backends/cadence/aot/export_example.py
index 146d4f806c..4ba5bffc96 100644
--- a/backends/cadence/aot/export_example.py
+++ b/backends/cadence/aot/export_example.py
@@ -60,6 +60,7 @@ def export_model(
     model: nn.Module,
     example_inputs: Tuple[Any, ...],
     file_name: str = "CadenceDemoModel",
+    run_and_compare: bool = True,
 ):
     # create work directory for outputs and model binary
     working_dir = tempfile.mkdtemp(dir="/tmp")
@@ -112,9 +113,10 @@
     )

     # TODO: move to test infra
-    runtime.run_and_compare(
-        executorch_prog=exec_prog,
-        inputs=example_inputs,
-        ref_outputs=ref_outputs,
-        working_dir=working_dir,
-    )
+    if run_and_compare:
+        runtime.run_and_compare(
+            executorch_prog=exec_prog,
+            inputs=example_inputs,
+            ref_outputs=ref_outputs,
+            working_dir=working_dir,
+        )

diff --git a/backends/cadence/aot/utils.py b/backends/cadence/aot/utils.py
index e8b64ef567..534b4f0d9f 100644
---
a/backends/cadence/aot/utils.py +++ b/backends/cadence/aot/utils.py @@ -162,7 +162,8 @@ def print_ops_info( # Print the final ops and their counts in a tabular format logging.info( - tabulate( + "\n" + + tabulate( sorted_ops_count, headers=[ "Final Operators ", # one character longer than the longest op name diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS index 1b55a7d541..db3fe0ad1e 100644 --- a/backends/cadence/runtime/TARGETS +++ b/backends/cadence/runtime/TARGETS @@ -7,6 +7,8 @@ python_library( srcs = [ "__init__.py", "executor.py", + "runtime.py", + "utils.py" ] + glob([ "xtsc-cfg/**/*", ]), diff --git a/examples/cadence/operators/TARGETS b/examples/cadence/operators/TARGETS new file mode 100644 index 0000000000..732f1ced09 --- /dev/null +++ b/examples/cadence/operators/TARGETS @@ -0,0 +1,26 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +load("@fbcode_macros//build_defs:python_unittest.bzl", "python_unittest") + +oncall("odai_jarvis") + + +python_unittest( + name = "test_add_op", + srcs = [ + "test_add_op.py", + ], + typing = True, + supports_static_listing = False, + deps = [ + "fbsource//third-party/pypi/parameterized:parameterized", + "//caffe2:torch", + "//executorch/backends/cadence/aot:ops_registrations", + "//executorch/backends/cadence/aot:export_example", + "//executorch/backends/cadence/aot:compiler", + ], +) diff --git a/examples/cadence/operators/test_add_op.py b/examples/cadence/operators/test_add_op.py new file mode 100644 index 0000000000..5481540b4f --- /dev/null +++ b/examples/cadence/operators/test_add_op.py @@ -0,0 +1,115 @@ +# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. 
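+
+# These tests drive the Cadence AOT export flow (export_example.export_model)
+# on small torch.add graphs. They pass run_and_compare=False, so each case
+# only checks that the model exports to a .pte binary for the given
+# shape/alpha combination; nothing is run or compared on-device.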
+ +import unittest +from typing import Tuple + +from parameterized import parameterized + +from executorch.backends.cadence.aot.ops_registrations import * # noqa + +import torch +import torch.nn as nn +from executorch.backends.cadence.aot.export_example import export_model + + +class ATenOpTestCases(unittest.TestCase): + @parameterized.expand( + [ + [(7, 5, 6), (7, 5, 6)], + [(7, 5, 6), (1)], + [(1), (7, 5, 6)], + [(1), (7, 5, 6), 2.23], + [(1), (7, 5, 6), -1.0], + [(1), (7, 5, 6), -2.23], + [(7, 5, 6), (7, 5, 6), 1.23], + [(6, 7), (6, 7)], + [(6, 7), (6, 7), 2], + # Broadcast tests (should be optimized on G3) + [(1, 32, 64), (1, 1, 64)], + [(1, 32, 64), (64)], + [(1, 1, 32), (32)], + [(16, 1, 16), (1, 1, 16)], + [(16, 1, 16), (16)], + [(1, 4, 8, 8), (1, 1, 8, 8)], + [(1, 4, 8, 8), (8, 8)], + # Broadcast tests (should go to portable ops) + [(1, 10, 1, 8), (4, 1, 4, 1)], + [(1, 1, 16), (1, 8, 1), 2.5], + # # aten.upsample_nearest2d tests + [(5, 6, 6, 8), (5, 6, 6, 8)], + [(1, 1, 12, 16), (1, 1, 12, 16)], + ] + ) + def test_aten_add_out( + self, Xshape: Tuple[int], Yshape: Tuple[int], alpha: float = 1 + ) -> None: + class AddTensor(nn.Module): + def __init__(self, alpha: float): + super().__init__() + self.alpha = alpha + + def forward(self, x: torch.Tensor, y: torch.Tensor): + return torch.add(x, y, alpha=self.alpha) + + model = AddTensor(alpha) + + X = torch.randn(Xshape) + Y = torch.randn(Yshape) + + model.eval() + export_model( + model, (X, Y), file_name=self._testMethodName, run_and_compare=False + ) + + @parameterized.expand( + [ + [(7, 5, 6), (7, 5, 6)], + [(7, 5, 6), (1)], + [(1), (7, 5, 6)], + [(1), (7, 5, 6), 2.23], + [(1), (7, 5, 6), -1.0], + [(1), (7, 5, 6), -2.23], + [(7, 5, 6), (7, 5, 6), 1.23], + [(6, 7), (6, 7)], + [(6, 7), (6, 7), 2], + # Broadcast tests (should be optimized on G3) + [(1, 32, 64), (1, 1, 64)], + [(1, 32, 64), (64)], + [(1, 1, 32), (32)], + [(16, 1, 16), (1, 1, 16)], + [(16, 1, 16), (16)], + [(1, 4, 8, 8), (1, 1, 8, 8)], + [(1, 4, 8, 8), (8, 8)], + # Broadcast tests (should go to portable ops) + [(1, 10, 1, 8), (4, 1, 4, 1)], + [(1, 1, 16), (1, 8, 1), 2.5], + # # aten.upsample_nearest2d tests + [(5, 6, 6, 8), (5, 6, 6, 8)], + [(1, 1, 12, 16), (1, 1, 12, 16)], + ] + ) + def test_aten_add_scalar_out( + self, Xshape: Tuple[int], Yshape: Tuple[int], alpha: float = 1 + ) -> None: + # Tensor-Scalar addition + class AddScalar(nn.Module): + def __init__(self, alpha: float): + super().__init__() + self.alpha = alpha + + def forward(self, x: torch.Tensor, y: float): + return torch.add(x, y, alpha=self.alpha) + + model = AddScalar(alpha) + + X = torch.randn(Xshape) + Y = 2.34 + + model.eval() + export_model( + model, (X, Y), file_name=self._testMethodName, run_and_compare=False + ) + + +if __name__ == "__main__": + unittest.main() From d679ad70a7745c60df581d6b110e6f79c389feb9 Mon Sep 17 00:00:00 2001 From: Max Ren <40742183+mcr229@users.noreply.github.com> Date: Wed, 27 Nov 2024 02:00:41 -0500 Subject: [PATCH 05/27] Update XNNPACK to 1ed874e65 (#6538) * Update XNNPACK to c88c8504fd9889c22391f0f3ece6061a7f855cf3 fix bug * Update test_llama.sh and test_llava.sh to use release mode as default --- .ci/scripts/test_llama.sh | 11 +++++++---- .ci/scripts/test_llava.sh | 16 ++++++++-------- .github/workflows/trunk.yml | 2 +- backends/xnnpack/third-party/XNNPACK | 2 +- backends/xnnpack/third-party/xnnpack.buck.bzl | 6 ++++-- .../xnnpack/third-party/xnnpack_src_defs.bzl | 12 +----------- 6 files changed, 22 insertions(+), 27 deletions(-) diff --git a/.ci/scripts/test_llama.sh 
b/.ci/scripts/test_llama.sh index e109845547..5e5ed588a2 100644 --- a/.ci/scripts/test_llama.sh +++ b/.ci/scripts/test_llama.sh @@ -51,6 +51,9 @@ UPLOAD_DIR="${UPLOAD_DIR:-}" # Default PT2E_QUANTIZE to empty string if not set PT2E_QUANTIZE="${PT2E_QUANTIZE:-}" +# Default CMake Build Type to release mode +CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} + if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args echo "Expecting atleast 4 positional arguments" echo "Usage: [...]" @@ -143,7 +146,7 @@ cmake_install_executorch_libraries() { rm -rf cmake-out retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \ @@ -157,7 +160,7 @@ cmake_install_executorch_libraries() { -DQNN_SDK_ROOT="$QNN_SDK_ROOT" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out . - cmake --build cmake-out -j9 --target install --config Debug + cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE" } cmake_build_llama_runner() { @@ -165,14 +168,14 @@ cmake_build_llama_runner() { dir="examples/models/llama" retry cmake \ -DCMAKE_INSTALL_PREFIX=cmake-out \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -Bcmake-out/${dir} \ ${dir} - cmake --build cmake-out/${dir} -j9 --config Debug + cmake --build cmake-out/${dir} -j9 --config "$CMAKE_BUILD_TYPE" } diff --git a/.ci/scripts/test_llava.sh b/.ci/scripts/test_llava.sh index 1057fa8f4a..a30143d895 100644 --- a/.ci/scripts/test_llava.sh +++ b/.ci/scripts/test_llava.sh @@ -8,11 +8,11 @@ set -exu # shellcheck source=/dev/null -BUILD_TYPE=${1:-Debug} TARGET_OS=${2:-Native} BUILD_DIR=${3:-cmake-out} +CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE:-Release} -echo "Building with BUILD_TYPE: $BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR" +echo "Building with CMAKE_BUILD_TYPE: $CMAKE_BUILD_TYPE, TARGET_OS: $TARGET_OS, BUILD_DIR: $BUILD_DIR" if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then PYTHON_EXECUTABLE=python3 @@ -32,7 +32,7 @@ if hash nproc &> /dev/null; then NPROC=$(nproc); fi EXECUTORCH_COMMON_CMAKE_ARGS=" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DEXECUTORCH_ENABLE_LOGGING=ON \ -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \ -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \ @@ -49,7 +49,7 @@ cmake_install_executorch_libraries() { ${EXECUTORCH_COMMON_CMAKE_ARGS} \ -B${BUILD_DIR} . - cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE} } cmake_install_executorch_libraries_for_android() { @@ -59,14 +59,14 @@ cmake_install_executorch_libraries_for_android() { ${EXECUTORCH_COMMON_CMAKE_ARGS} \ -B${BUILD_DIR} . 
- cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${BUILD_TYPE} + cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE} } LLAVA_COMMON_CMAKE_ARGS=" \ -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \ -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \ - -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \ -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \ -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \ -DEXECUTORCH_BUILD_XNNPACK=ON" @@ -81,7 +81,7 @@ cmake_build_llava_runner() { -B${BUILD_DIR}/${dir} \ ${dir} - cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${CMAKE_BUILD_TYPE} } @@ -98,7 +98,7 @@ cmake_build_llava_runner_for_android() { -B${BUILD_DIR}/${dir} \ ${dir} - cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${BUILD_TYPE} + cmake --build ${BUILD_DIR}/${dir} -j${NPROC} --config ${CMAKE_BUILD_TYPE} } # only export the one without custom op for now since it's diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index c5d33038e8..18c91691e9 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -290,7 +290,7 @@ jobs: # ${CONDA_RUN} python -m unittest examples.models.llava.test.test_llava # # run e2e (export, tokenizer and runner) - # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh Release + # PYTHON_EXECUTABLE=python ${CONDA_RUN} bash .ci/scripts/test_llava.sh test-qnn-model: name: test-qnn-model diff --git a/backends/xnnpack/third-party/XNNPACK b/backends/xnnpack/third-party/XNNPACK index d5d572e46e..4ea82e595b 160000 --- a/backends/xnnpack/third-party/XNNPACK +++ b/backends/xnnpack/third-party/XNNPACK @@ -1 +1 @@ -Subproject commit d5d572e46ed3929fa3e67f6174192893943cf724 +Subproject commit 4ea82e595b36106653175dcb04b2aa532660d0d8 diff --git a/backends/xnnpack/third-party/xnnpack.buck.bzl b/backends/xnnpack/third-party/xnnpack.buck.bzl index d2068661fe..6ce0316010 100644 --- a/backends/xnnpack/third-party/xnnpack.buck.bzl +++ b/backends/xnnpack/third-party/xnnpack.buck.bzl @@ -42,7 +42,7 @@ def define_xnnpack(): "XNNPACK/src/mutex.c", "XNNPACK/src/normalization.c", "XNNPACK/src/operator-utils.c", - "XNNPACK/src/packing.cc", + "XNNPACK/src/reference/packing.cc", ], headers = get_xnnpack_headers(), header_namespace = "", @@ -67,7 +67,7 @@ def define_xnnpack(): # @lint-ignore BUCKLINT: native and fb_native are explicitly forbidden in fbcode. native.cxx_library( name = "subgraph", - srcs = SUBGRAPH_SRCS, + srcs = SUBGRAPH_SRCS + ["XNNPACK/src/datatype.c"], compiler_flags = [ "-Wno-error=missing-braces", # required since the SGX toolchain does not have this by default ], @@ -1076,6 +1076,8 @@ def define_xnnpack(): "XNNPACK/src/configs/hardware-config.c", "XNNPACK/src/microparams-init.c", "XNNPACK/src/microkernel-utils.c", + "XNNPACK/src/reference/binary-elementwise.cc", + "XNNPACK/src/reference/unary-elementwise.cc", ], headers = get_xnnpack_headers(), exported_headers = { diff --git a/backends/xnnpack/third-party/xnnpack_src_defs.bzl b/backends/xnnpack/third-party/xnnpack_src_defs.bzl index 038b90acab..8cb9affede 100644 --- a/backends/xnnpack/third-party/xnnpack_src_defs.bzl +++ b/backends/xnnpack/third-party/xnnpack_src_defs.bzl @@ -17,24 +17,14 @@ def prod_srcs_for_arch_wrapper(arch): return define_xnnpack_build_src(prod_srcs) def get_xnnpack_headers(): - # XNNPACK Headers in the path containing xnnpack/ or configs/ - # do not contain the src/ path. 
However headers not in xnnpack/ or - # configs/ are prepend with the src/ path. This function helps us - # to correctly parse all the header files to the correct name src_headers = subdir_glob([ ("XNNPACK/src", "**/*.h"), ]) - fixed_headers = {} - for k, v in src_headers.items(): - new_key = k - if not k.startswith("xnnpack") and not k.startswith("configs"): - new_key = "src/{}".format(k) - fixed_headers[new_key] = v include_headers = subdir_glob([ ("XNNPACK/include", "*.h"), ]) - return fixed_headers | include_headers + return src_headers | include_headers OPERATOR_SRCS = define_xnnpack_build_src(_OPERATOR_SRCS) SUBGRAPH_SRCS = define_xnnpack_build_src(_SUBGRAPH_SRCS) From a8fa8574469e4aa06983b8695a7ded2182808d17 Mon Sep 17 00:00:00 2001 From: Erik Lundell Date: Fri, 15 Nov 2024 15:39:55 +0100 Subject: [PATCH 06/27] Add FVP testing to ops Add expected fails accordingly Signed-off-by: Erik Lundell Change-Id: Ic76626256ae4c53258536ffa747a7ee02832b168 --- backends/arm/test/ops/test_avg_pool.py | 13 +++++--- backends/arm/test/ops/test_bmm.py | 22 ++++++++++++-- backends/arm/test/ops/test_cat.py | 10 +++++- backends/arm/test/ops/test_clone.py | 5 ++- backends/arm/test/ops/test_conv1d.py | 8 ++++- backends/arm/test/ops/test_conv2d.py | 9 +++++- backends/arm/test/ops/test_conv_combos.py | 7 ++++- backends/arm/test/ops/test_depthwise_conv.py | 31 +++++++++++++++---- backends/arm/test/ops/test_div.py | 30 +++++++++++++++--- backends/arm/test/ops/test_exp.py | 7 +++-- backends/arm/test/ops/test_expand.py | 9 +++++- backends/arm/test/ops/test_full.py | 9 +++++- backends/arm/test/ops/test_hardtanh.py | 21 ++++++++++--- backends/arm/test/ops/test_layer_norm.py | 23 ++++++++++++-- backends/arm/test/ops/test_log.py | 5 ++- backends/arm/test/ops/test_mul.py | 32 ++++++++++++-------- backends/arm/test/ops/test_permute.py | 16 ++++++++-- backends/arm/test/ops/test_reciprocal.py | 19 ++++++------ backends/arm/test/ops/test_sub.py | 7 +++-- 19 files changed, 223 insertions(+), 60 deletions(-) diff --git a/backends/arm/test/ops/test_avg_pool.py b/backends/arm/test/ops/test_avg_pool.py index afd079fb95..ad3ddf8c0a 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -23,10 +23,10 @@ test_data_suite = [ # (test_name, test_data, [kernel_size, stride, padding]) - ("zeros", torch.zeros(20, 16, 50, 32), [4, 2, 0]), - ("ones", torch.zeros(20, 16, 50, 32), [4, 2, 0]), - ("rand", torch.rand(20, 16, 50, 32), [4, 2, 0]), - ("randn", torch.randn(20, 16, 50, 32), [4, 2, 0]), + ("zeros", torch.zeros(1, 16, 50, 32), [4, 2, 0]), + ("ones", torch.zeros(1, 16, 50, 32), [4, 2, 0]), + ("rand", torch.rand(1, 16, 50, 32), [4, 2, 0]), + ("randn", torch.randn(1, 16, 50, 32), [4, 2, 0]), ] @@ -101,7 +101,7 @@ def _test_avgpool2d_tosa_ethos_BI_pipeline( test_data: Tuple[torch.tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -116,7 +116,10 @@ def _test_avgpool2d_tosa_ethos_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_avgpool2d_tosa_MI( diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 6246657120..824ec46372 100644 --- a/backends/arm/test/ops/test_bmm.py 
+++ b/backends/arm/test/ops/test_bmm.py @@ -41,7 +41,7 @@ def forward(self, x, y): class BMMSingleInput(torch.nn.Module): test_parameters = [ (torch.rand(20, 3, 3),), - (torch.ones(2, 128, 128),), + (torch.rand(2, 128, 128),), (10000 * torch.randn(4, 25, 25),), (5 + 5 * torch.randn(3, 64, 64),), ] @@ -96,7 +96,7 @@ def _test_bmm_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor, ...], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -110,7 +110,10 @@ def _test_bmm_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) @parameterized.expand(BMM.test_parameters) def test_bmm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor): @@ -143,9 +146,20 @@ def test_bmm_single_input_tosa_BI(self, operand1: torch.Tensor): self._test_bmm_tosa_BI_pipeline(self.BMMSingleInput(), test_data) @parameterized.expand(BMM.test_parameters) + @unittest.expectedFailure def test_bmm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) - self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data) + self._test_bmm_ethosu_BI_pipeline( + self.BMM(), common.get_u55_compile_spec(), test_data + ) + + @parameterized.expand(BMM.test_parameters) + @common.expectedFailureOnFVP + def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_ethosu_BI_pipeline( + self.BMM(), common.get_u85_compile_spec(), test_data + ) # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy @parameterized.expand(BMMSingleInput.test_parameters) @@ -156,7 +170,9 @@ def test_bmm_single_input_u55_BI(self, operand1: torch.Tensor): self.BMMSingleInput(), common.get_u55_compile_spec(), test_data ) + # Numerical issues on FVP, MLETORCH 534 @parameterized.expand(BMMSingleInput.test_parameters) + @common.expectedFailureOnFVP def test_bmm_single_input_u85_BI(self, operand1: torch.Tensor): test_data = (operand1,) self._test_bmm_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py index b380c44d52..88846369d0 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -96,7 +96,7 @@ def _test_cat_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[tuple[torch.Tensor, ...], int], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -108,10 +108,14 @@ def _test_cat_ethosu_BI_pipeline( .check(["torch.ops.quantized_decomposed"]) .to_edge() .partition() + .dump_artifact() .check_not(["executorch_exir_dialects_edge__ops_aten_cat_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(inputs=test_data) @parameterized.expand(Cat.test_parameters) def test_cat_tosa_MI(self, operands: tuple[torch.Tensor, ...], dim: int): @@ -129,14 +133,18 @@ def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int): test_data = (operands, dim) self._test_cat_tosa_BI_pipeline(self.Cat(), test_data) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Cat.test_parameters) + @common.expectedFailureOnFVP def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): 
test_data = (operands, dim) self._test_cat_ethosu_BI_pipeline( self.Cat(), common.get_u55_compile_spec(), test_data ) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Cat.test_parameters) + @common.expectedFailureOnFVP def test_cat_u85_BI(self, operands: tuple[torch.Tensor, ...], dim: int): test_data = (operands, dim) self._test_cat_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 4721f257b0..6b5216a8e1 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -85,7 +85,7 @@ def _test_clone_tosa_ethos_pipeline( test_data: Tuple[torch.Tensor], ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -94,7 +94,10 @@ def _test_clone_tosa_ethos_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) def _test_clone_tosa_u55_pipeline( self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py index 133148faef..f00c7984a1 100644 --- a/backends/arm/test/ops/test_conv1d.py +++ b/backends/arm/test/ops/test_conv1d.py @@ -268,7 +268,7 @@ def _test_conv1d_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize() .export() @@ -277,7 +277,10 @@ def _test_conv1d_ethosu_BI_pipeline( .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite) def test_conv1d_tosa_MI(self, test_name, model): @@ -295,6 +298,9 @@ def test_conv1d_u55_BI(self, test_name, model): model, common.get_u55_compile_spec(), model.get_inputs() ) + # This specific test case has numerical errors on FVP, MLETORCH-520. 
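+    # (The remove() below runs at class-definition time, so the entry is
+    # gone before @parameterized.expand(testsuite) underneath is evaluated.)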
+ testsuite.remove(("5_3x2x128_st1", conv1d_5_3x2x128_st1)) + @parameterized.expand(testsuite) def test_conv1d_u85_BI(self, test_name, model): self._test_conv1d_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 43c3e85139..21df4bf0d5 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -295,7 +295,7 @@ def _test_conv2d_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -308,7 +308,10 @@ def _test_conv2d_ethosu_BI_pipeline( .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite) def test_conv2d_tosa_MI(self, test_name, model): @@ -318,6 +321,10 @@ def test_conv2d_tosa_MI(self, test_name, model): def test_conv2d_tosa_BI(self, test_name, model): self._test_conv2d_tosa_BI_pipeline(model, model.get_inputs()) + # These cases have numerical issues on FVP, MLETORCH-520 + testsuite.remove(("2x2_3x2x40x40_nobias", conv2d_2x2_3x2x40x40_nobias)) + testsuite.remove(("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1)) + @parameterized.expand(testsuite) def test_conv2d_u55_BI(self, test_name, model): self._test_conv2d_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 3e9bdef958..7555fff720 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -238,7 +238,7 @@ def _test_conv_combo_ethos_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -251,7 +251,10 @@ def _test_conv_combo_ethos_BI_pipeline( .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .check_not(list(module.edge_op_list)) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) #################### ## Conv + meandim ## @@ -272,6 +275,8 @@ def test_conv_meandim_u55_BI(self): model.get_inputs(), ) + # Numerical Issues on FVP, MLETORCH-520 + @common.expectedFailureOnFVP def test_conv_meandim_u85_BI(self): model = ComboConv2dMeandim() self._test_conv_combo_ethos_BI_pipeline( diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 4bfa863c49..28cb9ac844 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -8,8 +8,6 @@ from typing import Tuple -import pytest - import torch from executorch.backends.arm.test import common from executorch.backends.arm.test.ops.test_conv1d import Conv1d @@ -160,8 +158,8 @@ testsuite_conv1d = [ ("2_1x6x4_gp6_st1", dw_conv1d_2_1x6x4_gp6_st1), - ("3_1x3x256_gp3_st1", dw_conv1d_3_1x3x256_gp3_st1), ("two_dw_conv1d", two_dw_conv1d), + ("3_1x3x256_gp3_st1", dw_conv1d_3_1x3x256_gp3_st1), ("3_1x3x14_gp3_st1", dw_conv1d_3_1x3x14_gp3_st1), ] @@ -217,7 +215,7 @@ def _test_dw_conv_ethos_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -230,7 +228,10 @@ def _test_dw_conv_ethos_BI_pipeline( 
.check_not(["executorch_exir_dialects_edge__ops_aten_convolution_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite_conv1d + testsuite_conv2d) def test_dw_conv_tosa_MI(self, test_name: str, model: torch.nn.Module): @@ -238,11 +239,15 @@ def test_dw_conv_tosa_MI(self, test_name: str, model: torch.nn.Module): # TODO: Investigate flakyness (MLTORCH-307) @parameterized.expand(testsuite_conv1d + testsuite_conv2d) - @pytest.mark.flaky(reruns=3) def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module): self._test_dw_conv_tosa_BI_pipeline(model, model.get_inputs()) + testsuite_conv2d.remove( + ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1) + ) # Works + @parameterized.expand(testsuite_conv2d, skip_on_empty=True) + @common.expectedFailureOnFVP def test_dw_conv2d_u55_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): @@ -269,7 +274,21 @@ def test_dw_conv1d_u55_BI( model.get_inputs(), ) - @parameterized.expand(testsuite_conv1d + testsuite_conv2d) + # All test cases except 3x3_1x3x256x256_gp3_st1 have numerical issues on FVP. MLETORCH-520 + @parameterized.expand(testsuite_conv1d[:-2] + testsuite_conv2d) + @common.expectedFailureOnFVP + def test_dw_conv_u85_BI_xfails( + self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False + ): + self._test_dw_conv_ethos_BI_pipeline( + model, + common.get_u85_compile_spec( + permute_memory_to_nhwc=True, quantize_io=set_quantize_io + ), + model.get_inputs(), + ) + + @parameterized.expand(testsuite_conv1d[-2:]) def test_dw_conv_u85_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index 28cc686690..b3815f3e7c 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -136,10 +136,10 @@ def _test_div_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data, atol=1, rtol=0.1) ) - def _test_div_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.Tensor] + def _test_div_ethos_BI_pipeline( + self, module: torch.nn.Module, compile_spec, test_data: Tuple[torch.Tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -155,7 +155,10 @@ def _test_div_u55_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_div_tosa_MI( @@ -180,7 +183,9 @@ def test_div_tosa_BI( test_data = (input_, other_) self._test_div_tosa_BI_pipeline(self.Div(), test_data) + # Numerical issues on FVP likely due to mul op, MLETORCH-521 @parameterized.expand(test_data_suite) + @common.expectedFailureOnFVP def test_div_u55_BI( self, test_name: str, @@ -189,4 +194,21 @@ def test_div_u55_BI( rounding_mode: Optional[str] = None, ): test_data = (input_, other_) - self._test_div_u55_BI_pipeline(self.Div(), test_data) + self._test_div_ethos_BI_pipeline( + self.Div(), common.get_u55_compile_spec(), test_data + ) + + # Numerical issues on FVP likely due to mul op, MLETORCH-521 + @parameterized.expand(test_data_suite) + @common.expectedFailureOnFVP + def test_div_u85_BI( + self, + test_name: str, + input_: 
Union[torch.Tensor, torch.types.Number], + other_: Union[torch.Tensor, torch.types.Number], + rounding_mode: Optional[str] = None, + ): + test_data = (input_, other_) + self._test_div_ethos_BI_pipeline( + self.Div(), common.get_u85_compile_spec(), test_data + ) diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py index c706b7b206..f33e0a9058 100644 --- a/backends/arm/test/ops/test_exp.py +++ b/backends/arm/test/ops/test_exp.py @@ -20,7 +20,7 @@ ("zeros", torch.zeros(1, 10, 10, 10)), ("ones", torch.ones(10, 10, 10)), ("rand", torch.rand(10, 10) - 0.5), - ("randn_pos", torch.randn(10) + 10), + ("randn_pos", torch.randn(1, 4, 4, 4) + 10), ("randn_neg", torch.randn(10) - 10), ("ramp", torch.arange(-16, 16, 0.2)), ] @@ -78,7 +78,7 @@ def _test_exp_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -93,7 +93,10 @@ def _test_exp_ethosu_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_exp_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_exp_tosa_MI( diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py index effa7ce713..27f311b546 100644 --- a/backends/arm/test/ops/test_expand.py +++ b/backends/arm/test/ops/test_expand.py @@ -81,7 +81,7 @@ def _test_expand_ethosu_BI_pipeline( self, compile_spec: CompileSpec, module: torch.nn.Module, test_data: Tuple ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -95,7 +95,10 @@ def _test_expand_ethosu_BI_pipeline( .check_not(["torch.ops.aten.expand.default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(Expand.test_parameters) def test_expand_tosa_MI(self, test_input, multiples): @@ -105,13 +108,17 @@ def test_expand_tosa_MI(self, test_input, multiples): def test_expand_tosa_BI(self, test_input, multiples): self._test_expand_tosa_BI_pipeline(self.Expand(), (test_input, multiples)) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Expand.test_parameters) + @common.expectedFailureOnFVP def test_expand_u55_BI(self, test_input, multiples): self._test_expand_ethosu_BI_pipeline( common.get_u55_compile_spec(), self.Expand(), (test_input, multiples) ) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Expand.test_parameters) + @common.expectedFailureOnFVP def test_expand_u85_BI(self, test_input, multiples): self._test_expand_ethosu_BI_pipeline( common.get_u85_compile_spec(), self.Expand(), (test_input, multiples) diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py index d4cfc5c369..9857a7b87b 100644 --- a/backends/arm/test/ops/test_full.py +++ b/backends/arm/test/ops/test_full.py @@ -97,7 +97,7 @@ def _test_full_tosa_BI_pipeline( def _test_full_tosa_ethos_pipeline( self, compile_spec: list[CompileSpec], module: torch.nn.Module, test_data: Tuple ): - ( + tester = ( ArmTester(module, example_inputs=test_data, compile_spec=compile_spec) .quantize() .export() @@ -107,7 +107,10 
@@ def _test_full_tosa_ethos_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_full_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple): self._test_full_tosa_ethos_pipeline( @@ -140,14 +143,18 @@ def test_full_tosa_MI(self, test_tensor: Tuple): def test_full_tosa_BI(self, test_tensor: Tuple): self._test_full_tosa_BI_pipeline(self.AddVariableFull(), test_tensor, False) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(AddVariableFull.test_parameters) + @common.expectedFailureOnFVP def test_full_u55_BI(self, test_tensor: Tuple): self._test_full_tosa_u55_pipeline( self.AddVariableFull(), test_tensor, ) + # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(AddVariableFull.test_parameters) + @common.expectedFailureOnFVP def test_full_u85_BI(self, test_tensor: Tuple): self._test_full_tosa_u85_pipeline( self.AddVariableFull(), diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py index a9f12abdf0..10073c5095 100644 --- a/backends/arm/test/ops/test_hardtanh.py +++ b/backends/arm/test/ops/test_hardtanh.py @@ -87,15 +87,15 @@ def _test_hardtanh_tosa_BI_pipeline( .run_method_and_compare_outputs(inputs=test_data) ) - def _test_hardtanh_tosa_u55_BI_pipeline( - self, module: torch.nn.Module, test_data: Tuple[torch.tensor] + def _test_hardtanh_tosa_ethosu_BI_pipeline( + self, compile_spec, module: torch.nn.Module, test_data: Tuple[torch.tensor] ): quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config()) - ( + tester = ( ArmTester( module, example_inputs=test_data, - compile_spec=common.get_u55_compile_spec(), + compile_spec=compile_spec, ) .quantize(Quantize(quantizer, get_symmetric_quantization_config())) .export() @@ -106,7 +106,10 @@ def _test_hardtanh_tosa_u55_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_hardtanh_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_hardtanh_tosa_MI( @@ -122,4 +125,12 @@ def test_hardtanh_tosa_BI(self, test_name: str, test_data: torch.Tensor): @parameterized.expand(test_data_suite) def test_hardtanh_tosa_u55_BI(self, test_name: str, test_data: torch.Tensor): - self._test_hardtanh_tosa_u55_BI_pipeline(self.HardTanh(), (test_data,)) + self._test_hardtanh_tosa_ethosu_BI_pipeline( + common.get_u55_compile_spec(), self.HardTanh(), (test_data,) + ) + + @parameterized.expand(test_data_suite) + def test_hardtanh_tosa_u85_BI(self, test_name: str, test_data: torch.Tensor): + self._test_hardtanh_tosa_ethosu_BI_pipeline( + common.get_u85_compile_spec(), self.HardTanh(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py index f059d71eba..0b06044a59 100644 --- a/backends/arm/test/ops/test_layer_norm.py +++ b/backends/arm/test/ops/test_layer_norm.py @@ -115,7 +115,7 @@ def _test_layernorm_ethosu_BI_pipeline( compile_spec: CompileSpec, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( model=module, example_inputs=test_data, @@ -128,7 +128,10 @@ def 
_test_layernorm_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_layer_norm_tosa_MI( @@ -152,8 +155,10 @@ def test_layer_norm_tosa_BI( self.LayerNorm(*model_params), (test_data,) ) + # Numerical issues on FVP likely due to mul op, MLETORCH-521 # Skip tests that require transposes. @parameterized.expand(test_data_suite[:-2]) + @common.expectedFailureOnFVP def test_layer_norm_u55_BI( self, test_name: str, @@ -164,7 +169,21 @@ def test_layer_norm_u55_BI( self.LayerNorm(*model_params), common.get_u55_compile_spec(), (test_data,) ) - @parameterized.expand(test_data_suite) + # Numerical issues on FVP likely due to mul op, MLETORCH-521 + @parameterized.expand(test_data_suite[:-1]) + @common.expectedFailureOnFVP + def test_layer_norm_u85_BI_fvp_xfails( + self, + test_name: str, + test_data: torch.Tensor, + model_params, + ): + self._test_layernorm_ethosu_BI_pipeline( + self.LayerNorm(*model_params), common.get_u85_compile_spec(), (test_data,) + ) + + @parameterized.expand(test_data_suite[-1:]) + @unittest.skip # Flaky def test_layer_norm_u85_BI( self, test_name: str, diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py index 847635ea36..10175d27fb 100644 --- a/backends/arm/test/ops/test_log.py +++ b/backends/arm/test/ops/test_log.py @@ -78,7 +78,7 @@ def _test_log_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -93,7 +93,10 @@ def _test_log_ethosu_BI_pipeline( .check_not(["executorch_exir_dialects_edge__ops_aten_log_default"]) .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_log_tosa_MI( diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index 7fa20c2566..8f0321ea5f 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -16,9 +16,9 @@ test_data_sute = [ # (test_name, input, other,) See torch.mul() for info ( - "op_mul_rank1_ones", - torch.ones(5), - torch.ones(5), + "op_mul_rank1_rand", + torch.rand(5) * 3.7, + torch.rand(5) * 1.5, ), ( "op_mul_rank2_rand", @@ -32,23 +32,23 @@ ), ( "op_mul_rank4_randn", - torch.randn(5, 10, 25, 20), - torch.randn(5, 10, 25, 20), + torch.randn(1, 10, 25, 20), + torch.randn(1, 10, 25, 20), ), ( "op_mul_rank4_ones_mul_negative", torch.ones(1, 10, 25, 20), - (-1) * torch.ones(5, 10, 25, 20), + (-1) * torch.ones(1, 10, 25, 20), ), ( "op_mul_rank4_negative_large_rand", - (-200) * torch.rand(5, 10, 25, 20), - torch.rand(5, 1, 1, 20), + (-200) * torch.rand(1, 10, 25, 20), + torch.rand(1, 1, 1, 20), ), ( "op_mul_rank4_large_randn", - 200 * torch.randn(5, 10, 25, 20), - torch.rand(5, 10, 25, 1), + 200 * torch.randn(1, 10, 25, 20), + torch.rand(1, 10, 25, 1), ), ] @@ -112,7 +112,7 @@ def _test_mul_ethosu_BI_pipeline( module: torch.nn.Module, test_data: tuple[torch.Tensor, torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -126,7 +126,10 @@ def _test_mul_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if 
common.is_option_enabled("corstone300"):
+            tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)

     @parameterized.expand(test_data_sute)
     def test_mul_tosa_MI(
@@ -149,7 +152,9 @@ def test_mul_tosa_BI(
         test_data = (input_, other_)
         self._test_mul_tosa_BI_pipeline(self.Mul(), test_data)

+    # Numerical issues on FVP, MLETORCH-521
     @parameterized.expand(test_data_sute)
+    @common.expectedFailureOnFVP
     def test_mul_u55_BI(
         self,
         test_name: str,
@@ -161,7 +166,10 @@ def test_mul_u55_BI(
             common.get_u55_compile_spec(), self.Mul(), test_data
         )

-    @parameterized.expand(test_data_sute)
+    # Numerical issues on FVP, MLETORCH-521
+    # test_data_sute[0] works on U85
+    @parameterized.expand(test_data_sute[1:])
+    @common.expectedFailureOnFVP
     def test_mul_u85_BI(
         self,
         test_name: str,

diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py
index 62b6b823de..92400215b7 100644
--- a/backends/arm/test/ops/test_permute.py
+++ b/backends/arm/test/ops/test_permute.py
@@ -100,7 +100,7 @@ def _test_permute_ethos_BI_pipeline(
         test_data: Tuple[torch.Tensor],
     ):
         quantizer = ArmQuantizer().set_io(get_symmetric_quantization_config())
-        (
+        tester = (
             ArmTester(
                 module,
                 example_inputs=test_data,
@@ -117,6 +117,8 @@ def _test_permute_ethos_BI_pipeline(
             .to_executorch()
             .serialize()
         )
+        if common.is_option_enabled("corstone300"):
+            tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)

     @parameterized.expand(test_data_suite)
     def test_permute_tosa_MI(
@@ -143,10 +145,20 @@ def test_permute_u55_BI(
             self.Permute(dims=dims), common.get_u55_compile_spec(), (test_data,)
         )

-    @parameterized.expand(test_data_suite)
+    @parameterized.expand(test_data_suite[:-2])
     def test_permute_u85_BI(
         self, test_name: str, test_data: torch.Tensor, dims: list[int]
     ):
         self._test_permute_ethos_BI_pipeline(
             self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,)
         )
+
+    # Fails on FVP since N > 1 is not supported.
MLETORCH-517 + @parameterized.expand(test_data_suite[-2:]) + @common.expectedFailureOnFVP + def test_permute_u85_BI_xfails( + self, test_name: str, test_data: torch.Tensor, dims: list[int] + ): + self._test_permute_ethos_BI_pipeline( + self.Permute(dims=dims), common.get_u85_compile_spec(), (test_data,) + ) diff --git a/backends/arm/test/ops/test_reciprocal.py b/backends/arm/test/ops/test_reciprocal.py index 7745a614e6..876f063c76 100644 --- a/backends/arm/test/ops/test_reciprocal.py +++ b/backends/arm/test/ops/test_reciprocal.py @@ -22,12 +22,12 @@ torch.rand(5) * 5, ), ("op_reciprocal_rank1_negative_ones", torch.ones(5) * (-1)), - ("op_reciprocal_rank4_ones", torch.ones(5, 10, 25, 20)), - ("op_reciprocal_rank4_negative_ones", (-1) * torch.ones(5, 10, 25, 20)), - ("op_reciprocal_rank4_ones_reciprocal_negative", torch.ones(5, 10, 25, 20)), - ("op_reciprocal_rank4_large_rand", 200 * torch.rand(5, 10, 25, 20)), - ("op_reciprocal_rank4_negative_large_rand", (-200) * torch.rand(5, 10, 25, 20)), - ("op_reciprocal_rank4_large_randn", 200 * torch.randn(5, 10, 25, 20) + 1), + ("op_reciprocal_rank4_ones", torch.ones(1, 10, 25, 20)), + ("op_reciprocal_rank4_negative_ones", (-1) * torch.ones(1, 10, 25, 20)), + ("op_reciprocal_rank4_ones_reciprocal_negative", torch.ones(1, 10, 25, 20)), + ("op_reciprocal_rank4_large_rand", 200 * torch.rand(1, 10, 25, 20)), + ("op_reciprocal_rank4_negative_large_rand", (-200) * torch.rand(1, 10, 25, 20)), + ("op_reciprocal_rank4_large_randn", 200 * torch.randn(1, 10, 25, 20) + 1), ] @@ -81,7 +81,7 @@ def _test_reciprocal_tosa_BI_pipeline( def _test_reciprocal_u55_BI_pipeline( self, module: torch.nn.Module, test_data: tuple[torch.Tensor] ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -95,15 +95,16 @@ def _test_reciprocal_u55_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) def test_reciprocal_tosa_MI(self, test_name: str, input_: torch.Tensor): test_data = (input_,) self._test_reciprocal_tosa_MI_pipeline(self.Reciprocal(), test_data) - # Expected to fail since ArmQuantizer cannot quantize a Reciprocal layer - # TODO(MLETORCH-129) @parameterized.expand(test_data_suite) def test_reciprocal_tosa_BI(self, test_name: str, input_: torch.Tensor): diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py index 5c67240e52..327a8de994 100644 --- a/backends/arm/test/ops/test_sub.py +++ b/backends/arm/test/ops/test_sub.py @@ -17,7 +17,7 @@ from parameterized import parameterized -class TestSimpleSub(unittest.TestCase): +class TestSub(unittest.TestCase): class Sub(torch.nn.Module): test_parameters = [ (torch.ones(5),), @@ -82,7 +82,7 @@ def _test_sub_ethosu_BI_pipeline( module: torch.nn.Module, test_data: Tuple[torch.Tensor], ): - ( + tester = ( ArmTester( module, example_inputs=test_data, @@ -96,7 +96,10 @@ def _test_sub_ethosu_BI_pipeline( .partition() .check_count({"torch.ops.higher_order.executorch_call_delegate": 1}) .to_executorch() + .serialize() ) + if common.is_option_enabled("corstone300"): + tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(Sub.test_parameters) def test_sub_tosa_MI(self, test_data: torch.Tensor): From f8bc7747cde15c7297f06d637991649112ef12c6 Mon Sep 17 00:00:00 2001 From: cad-audio <86048415+cad-audio@users.noreply.github.com> Date: Wed, 27 Nov 
2024 07:57:45 -0800 Subject: [PATCH 07/27] HiFi optimizations for mean, where, min, max, pow, rem and quantized_linear operators. (#6867) * Adding mean and where ops optimized on HiFi * Adding quantized linear optimized versions for int8 and uint8 * adding pow, remainder, minimum, maximum operators (#33) * adding pow, remainder, minimum, maximum operators * adding pow, remainder, minimum, maximum operators * Fix for build issue faced in div_mod on old tools * Fix build failure due to merge issue * Fixing review comments on PR 6867 --------- Co-authored-by: dijopaul Co-authored-by: nishpoonia <94543206+nishpoonia@users.noreply.github.com> Co-authored-by: mcremon-meta <134334895+mcremon-meta@users.noreply.github.com> --- backends/cadence/aot/functions_hifi.yaml | 32 +- backends/cadence/hifi/kernels/CMakeLists.txt | 3 + backends/cadence/hifi/kernels/kernels.h | 42 + .../cadence/hifi/operators/CMakeLists.txt | 4 + .../cadence/hifi/operators/op_maximum.cpp | 175 +++ .../cadence/hifi/operators/op_minimum.cpp | 173 +++ backends/cadence/hifi/operators/op_pow.cpp | 354 +++++ backends/cadence/hifi/operators/op_rsqrt.cpp | 55 + .../hifi/operators/quantized_linear_out.cpp | 38 +- .../third-party/nnlib/xa_nn_broadcast_32.c | 313 +++++ .../third-party/nnlib/xa_nn_broadcast_32_32.c | 313 +++++ .../nnlib/xa_nn_elm_minimum_maximum_f32.c | 847 ++++++++++++ .../third-party/nnlib/xa_nn_elm_pow_f32.c | 1151 +++++++++++++++++ 13 files changed, 3478 insertions(+), 22 deletions(-) create mode 100644 backends/cadence/hifi/operators/op_maximum.cpp create mode 100644 backends/cadence/hifi/operators/op_minimum.cpp create mode 100644 backends/cadence/hifi/operators/op_pow.cpp create mode 100644 backends/cadence/hifi/operators/op_rsqrt.cpp create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index cf234c22c0..b6a2c50001 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -77,10 +77,20 @@ - arg_meta: null kernel_name: torch::executor::max_pool2d_with_indices_out +- op: maximum.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::maximum_out + - op: mean.out kernels: - arg_meta: null - kernel_name: cadence::impl::HiFi::mean_dim_out + kernel_name: cadence::impl::HiFi::mean_dim_out + +- op: minimum.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::minimum_out - op: mul.out kernels: @@ -92,6 +102,26 @@ - arg_meta: null kernel_name: torch::executor::permute_copy_out +- op: pow.Scalar_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::pow_Scalar_out + +- op: pow.Tensor_Scalar_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::pow_Tensor_Scalar_out + +- op: pow.Tensor_Tensor_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::pow_Tensor_Tensor_out + +- op: rsqrt.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::rsqrt_out + - op: sigmoid.out kernels: - arg_meta: null diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 9321cc544e..3d321443f8 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ 
b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -9,10 +9,13 @@ add_library( cadence_kernels kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c ) diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 2c915661f8..10927adc2a 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -15,6 +15,14 @@ #include "xa_nnlib_kernels_api.h" /* Potential NNLIB function/APIs */ + +extern "C" WORD32 xa_nn_broadcast_32_32( + WORD32* __restrict__ p_out, + const int* const out_shape, + WORD32* __restrict__ p_in, + const int* const in_shape, + int num_dims); + extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -47,6 +55,34 @@ extern "C" WORD32 xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( const WORD32* const p_inp2_shape, WORD32 mode); +extern "C" WORD32 xa_nn_elm_maximum_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + +extern "C" WORD32 xa_nn_elm_minimum_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -55,6 +91,12 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( const FLOAT32* __restrict__ p_inp2, const WORD32* const p_inp2_shape); +extern "C" void xa_nn_elm_pow_f32( + FLOAT32* restrict z, + const FLOAT32* restrict x, + const FLOAT32* restrict y, + WORD32 N); + extern "C" WORD32 xa_nn_elm_where_f32xf32_f32( FLOAT32* __restrict__ p_out, const FLOAT32* __restrict__ p_inp1, diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index fc00345465..5e51f7fd3b 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -22,8 +22,12 @@ endif() set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" 
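+ # Each op_*.cpp listed here provides the HiFi-optimized kernel named in functions_hifi.yaml; this change adds the maximum, minimum, pow, and rsqrt sources below.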
"${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp" diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp new file mode 100644 index 0000000000..f9a3658891 --- /dev/null +++ b/backends/cadence/hifi/operators/op_maximum.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { +namespace { + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = + torch::executor::native::utils::max_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& maximum_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + bool optimized = true; + /*find broadcast*/ + bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = 
a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = false; + if ((broadcast == true) && (max_dim > kNnlibMaxDim)) + optimized = false; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast == true) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + off_o] = out.size(i); + } + + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_maximum_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + return out; + } + ET_SWITCH_REALHB_TYPES(a_type, ctx, "maximum.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "maximum.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "maximum.out", CTYPE_OUT, [&]() { + MaximumInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_minimum.cpp b/backends/cadence/hifi/operators/op_minimum.cpp new file mode 100644 index 0000000000..6f81ad5c3e --- /dev/null +++ b/backends/cadence/hifi/operators/op_minimum.cpp @@ -0,0 +1,173 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { +namespace { + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = + torch::executor::native::utils::min_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& minimum_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + bool optimized = true; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = false; + if ((broadcast == true) && (max_dim > kNnlibMaxDim)) + optimized = false; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast == true) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + off_o] = out.size(i); + } + + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_minimum_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + return out; + } + ET_SWITCH_REALHB_TYPES(a_type, ctx, "minimum.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "minimum.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "minimum.out", CTYPE_OUT, [&]() { + MinimumInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp new file mode 100644 index 0000000000..9669e96123 --- /dev/null +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -0,0 +1,354 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::promoteTypes; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + torch::executor::apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& pow_Tensor_Tensor_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK( + ctx, common_type != exec_aten::ScalarType::Bool, InvalidArgument, out); + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + constexpr auto name = "pow.Tensor_Tensor_out"; + constexpr int kNnlibMaxDim = 16; + int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); + bool optimized = true; + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted && b_is_broadcasted); + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
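+ /* (kNnlibMaxDim is 16 here, rather than 4, because this path pre-broadcasts its inputs with xa_nn_broadcast_32_32 instead of relying on a 4D broadcast kernel) */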
out.dim() : max_dim; + + if (out_type != ScalarType::Float) + optimized = false; + + if (max_dim > kNnlibMaxDim) + optimized = false; + + WORD32 num_elm = out.numel(); + + if (optimized) { + if (broadcast) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + WORD32* __restrict__ ptr2 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)a.const_data_ptr(); + WORD32* __restrict__ pin2 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + WORD32 p_inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i] = a.size(i); + for (int i = 0; i < b_dim; i++) + p_inp2_shape[i] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + xa_nn_broadcast_32_32(ptr2, p_out_shape, pin2, p_inp2_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr2; + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + free(ptr2); + } else if (a_is_broadcasted && (!b_is_broadcasted)) { + FLOAT32* __restrict__ ptr1 = + (FLOAT32* __restrict__)malloc((num_elm + 2) * sizeof(WORD32)); + + FLOAT32* __restrict__ pin1 = + (FLOAT32* __restrict__)a.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i] = a.size(i); + + xa_nn_broadcast_32_32( + (WORD32*)ptr1, p_out_shape, (WORD32*)pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else if (b_is_broadcasted && (!a_is_broadcasted)) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < b_dim; i++) + p_inp1_shape[i] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr1; + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else { + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + } + return out; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + 
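+ // Non-optimized fallback, reached when the float fast path above bails out: compute std::pow in the promoted dtype CTYPE_IN and cast to the output dtype.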
ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + PowInner< + !std::is_same::value && + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + + return out; +} + +Tensor& pow_Tensor_Scalar_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = torch::executor::native::utils::get_scalar_dtype(b); + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + a_type, b, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + constexpr auto name = "pow.Tensor_Scalar_out"; + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + torch::executor::native::utils::extract_scalar(b, &val_b); + torch::executor::apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +Tensor& pow_Scalar_out( + KernelRuntimeContext& ctx, + const Scalar& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, b.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = torch::executor::native::utils::get_scalar_dtype(a); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + b_type, a, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + constexpr auto name = "pow.Scalar_out"; + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_SCALAR_OBJ_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_A val_a = 0; + torch::executor::native::utils::extract_scalar(a, &val_a); + + torch::executor::apply_unary_map_fn( + [val_a](const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + b.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence + diff --git a/backends/cadence/hifi/operators/op_rsqrt.cpp b/backends/cadence/hifi/operators/op_rsqrt.cpp new file mode 100644 index 0000000000..1cf717988a --- /dev/null +++ b/backends/cadence/hifi/operators/op_rsqrt.cpp @@ -0,0 +1,55 @@ +/* 
+ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { +namespace { + +double rsqrt(double x) { + return 1.0 / std::sqrt(x); +} + +} // namespace + +Tensor& rsqrt_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { + bool optimized = true; + + if (out.scalar_type() != ScalarType::Float) + optimized = false; + + if (optimized) { + WORD32 num_elm = out.numel(); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp = + (const FLOAT32* __restrict__)in.const_data_ptr(); + + xa_nn_elm_rsqrt_f32_f32(p_out, p_inp, num_elm); + return out; + } + + return torch::executor::native::internal:: + unary_ufunc_realhbbf16_to_floathbf16(rsqrt, ctx, in, out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index 0f56a1a963..accc610132 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -26,6 +26,9 @@ using ::executorch::aten::Tensor; using ::executorch::runtime::getLeadingDims; using ::executorch::runtime::KernelRuntimeContext; + + // The nnlib kernel to compute quantized linear via matmul. + void _quantized_linear_asym8u( const Tensor& in, const Tensor& weight, @@ -37,37 +40,30 @@ void _quantized_linear_asym8u( int64_t out_zero_point, __ET_UNUSED const optional& offset, Tensor& out) { - // input comes in shape [leading_dims, in_dim] - // weight comes in shape [out_dim, in_dim] - // output comes in empty with shape [leading_dims, out_dim] - // Perform matrix multiply (M x N) x (N x P)' => M x P const int64_t leading_dims = getLeadingDims(in, in.dim() - 1); const int64_t out_dim = weight.size(0); // = out_dim const int64_t in_dim = weight.size(1); // = in_dim - const uint8_t* __restrict__ in_data = in.const_data_ptr(); const uint8_t* __restrict__ weight_data = weight.const_data_ptr(); const int32_t* __restrict__ bias_data = bias.const_data_ptr(); uint8_t* __restrict__ out_data = out.mutable_data_ptr(); - - // The nnlib kernel to compute quantized linear via matmul. int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u( - out_data, // p_out - weight_data, // p_mat1, - in_data, // p_mat2, - bias_data, // p_bias - out_dim, // rows of p_mat1 - in_dim, // cols of p_mat1 - in_dim, // row_stride of p_mat1 - leading_dims, // vec_count, i.e., rows of p_mat2 - in_dim, // vec_offset of p_mat2. 
- out_dim, // out_offset, i.e., offset of next output element written - 1, // out_stride, i.e., stride to go to next output row + out_data, + weight_data, + in_data, + bias_data, + out_dim, + in_dim, + in_dim, + leading_dims, + in_dim, + out_dim, + 1, -weight_zero_point.const_data_ptr()[0], // mat1_zero_bias -in_zero_point, // mat2_zero_bias - out_multiplier.const_data_ptr()[0], // out_multiplier - out_shift.const_data_ptr()[0], // out_shift - out_zero_point); // out_zero_bias + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + out_zero_point); ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed"); } diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c new file mode 100644 index 0000000000..cad3f1a25b --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +/* + * xa_nn_broadcast_8_8.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += 
step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c new file mode 100644 index 0000000000..34a7111ee7 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +/* + * xa_nn_broadcast_32_32.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += 
step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c new file mode 100644 index 0000000000..3af93fc00c --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c @@ -0,0 +1,847 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +#include "nnlib-hifi4/xa_nnlib/include/xa_type_def.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" +#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" +#include "nnlib-hifi4/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h" + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_maximum_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_maximum_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + y = XT_MAX_SX2(x2, x1); + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_MAX_SX2(x2, x1); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_MAX_S(a1, a2); + XT_SSI(a, (xtfloat *)out, 0); + } + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_maximum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_MAX_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } 
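+ /* aligned case: all three pointers are 8-byte aligned, so the paired XT_LSX2IP/XT_SSX2IP loop above is safe; the else branch below uses ae_valign priming loads/stores for unaligned data */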
+ } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_MAX_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_MAX_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } +} + +static void internal_elm_maximum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_maximum_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_maximum_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 
1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_maximum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_maximum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_minimum_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_minimum_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + y = XT_MIN_SX2(x2, x1); + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_MIN_SX2(x2, x1); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_MIN_S(a1, a2); + XT_SSI(a, (xtfloat *)out, 0); + } + 
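+ // Vectorized pairs plus the scalar tail above cover all num_elm elements.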
return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_minimum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_MIN_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_MIN_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_MIN_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } +} + +static void internal_elm_minimum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? 
p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_minimum_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_minimum_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_minimum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ 
p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_minimum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c new file mode 100644 index 0000000000..4dcec52f97 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c @@ -0,0 +1,1151 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ("Cadence */ +/* Libraries") are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* DSP Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2015-2018 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +/* + NatureDSP Signal Processing Library. Vector mathematics + Vector operations + code optimized for HiFi4 core + IntegrIT, 2006-2018 +*/ + +#include "../include/NatureDSP_Signal_math.h" +#include "NatureDSP_types.h" +#include "xa_nn_common.h" + +/* Common helper macros. */ +#include "xa_nnlib_common_fpu.h" + +#include "xa_nnlib_common.h" +/* Constant tables. */ + +const union ufloat32uint32 ALIGN(8) xa_nnlib_pow2f_coef[] = +{ + { 0x39222a65 }, + { 0x3aaf931c }, + { 0x3c1d94fc }, + { 0x3d63578a }, + { 0x3e75fdf0 }, + { 0x3f317218 }, + { 0x3f800000 } + + //{ 0x3aaf931b }, + //{ 0x3c1e7220 }, + //{ 0x3d63578a }, + //{ 0x3e75fcc9 }, + //{ 0x3f317218 }, + //{ 0x3f800000 } + +}; + +const union ufloat32uint32 ALIGN(8) xa_nnlib_log2f_coef[] = +{ + { 0x3d726a49 }, + { 0x3dd91c88 }, + { 0x3ddde76c }, + { 0x3de21e63 }, + { 0x3dfe600b }, + { 0x3e124679 }, + { 0x3e2ab2f1 }, + { 0x3e4ccd1b }, + { 0x3e7fffde }, + { 0x3eaaaaaa }, + { 0x3f000000 }, + { 0x3f800000 }, + /* log2(e) */ + { 0x3fb8aa3b }, /* 1.4426950216 */ + { 0x32a57060 } /* 1.9259629891e-008 */ +}; + +const union ufloat32uint32 xa_nnlib_pow_plusInff ={0x7f800000}; + +const union ufloat32uint32 xa_nnlib_pow_qNaNf = { 0x7fc00000 }; + +#define MIN(a,b) ( (a)<(b) ? 
(a) : (b) ) +#define MAX(a,b) ( (a)>(b) ? (a) : (b) ) + +/*------------------------------------------------------------------------- + Power function + These routines calculate power function for 32-bit fixed-point numbers or + floating point numbers. + For the fixed point API, The base is represented in Q31, the exponent + is represented in Q6.25. Results are represented as normalized fixed point + number with separate mantissa in Q31 and exponent. + + Precision: + 32x32 32-bit inputs, 32-bit outputs + f floating point input, floating point output + + Accuracy: + 2 ULP for fixed point API + 2 ULP under condition that |y|<=100 + + Notes: +1. Scalar floating point raise to a power functions conform to ANSI C requirements on + standard math library functions in respect to treatment of errno and floating- + point exceptions. Vectorized function does not touch errno and may raise or not raise + floating point exceptions. +2. For floating point API, If x<0 is finite, y is finite and not an integer value, + then the respective result z is set to NaN +3. For fixed point API, function returns zero for all non-positive x. Fixed point + functions never touch errno + + Special cases: + x | y | Result | Extra Conditions + --------+--------+--------+--------------------- + floating point API + --------+--------+--------+--------------------- + +/-0 | y | +/-inf | odd y<0 + +/-0 | y | +inf | even y<0 + +/-0 | y | +/-0 | odd y>0 + +/-0 | y | 0 | even y>0 + +/-1 | +/-inf | 1 | + 1 | y | 1 | any y including NaN + x | +/-0 | 1 | any x including NaN + x | y | NaN | finite x<0 and finite + | | | non-integer y (see + | | | note 2) + x | -inf | +inf | |x|<1 + x | -inf | 0 | |x|>1 + x | +inf | 0 | |x|<1 + x | +inf | +inf | |x|>1 + -inf | y | -0 | y an odd integer <0 + -inf | y | 0 | y<0 and not an odd + | | | integer + -inf | y | -inf | y an odd integer >0 + -inf | y | +inf | y>0 and not an odd + | | | integer + +inf | y | 0 | y<0 + +inf | y | +inf | y>0 + --------+--------+--------+--------------------- + fixed point API + --------+--------+--------+--------------------- + x | y | 0 | x<=0 + --------+--------+--------+--------------------- + + Input: + x[N] input data,Q0.31 or floating point + y[N] input data,Q6.25 or floating point + N length of vectors + Output (fixed point API): + m[N] mantissa of output, Q31 + e[N] exponent of output + Output (floating point API): + z[N] results: floating point + + Restriction: + z,x,y,m should not overlap +-------------------------------------------------------------------------*/ + +#if !HAVE_VFPU && !HAVE_FPU +DISCARD_FUN(void, xa_nn_elm_pow_f32, (FLOAT32 * restrict z, const FLOAT32 * restrict y, const FLOAT32 * restrict x, WORD32 N)) +#elif HAVE_VFPU +#define sz_f32 (int)sizeof(FLOAT32) +static void mypowf(FLOAT32 * scr, + FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + WORD32 N ) +{ + /* Table of different constants used in computations */ + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + int n; + const xtfloatx2 * pX; + const xtfloatx2 * pY; + + const xtfloatx2 * restrict S_rd; + xtfloatx2 * restrict S_wr; + xtfloatx2 * restrict pZ; + const ae_int32 * restrict TBL; + const xtfloat * restrict TBL_LOG2; + const xtfloat * restrict TBL_POW2; + xtfloatx2 x0, y0, z0, t0, t1, ef0; + xtfloatx2 c2f, 
c3f, c4f; + xtfloatx2 _0, _1, half; + ae_int32x2 c0i, c1i, c5i, c7i, c8i; + ae_int32x2 e0, xi0, yi0, ex0; + xtbool2 bsx, bsy, bdenorm, bsmall; + ae_valign aX, aY, aZ; + + /* overall number of blocks; number of values in the current block */ + int blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ / (3*sz_f32); + + + if (N <= 0) return; + + NASSERT(N % 2 == 0); + NASSERT_ALIGN16(scr); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + + blkLen = 0; + TBL = (const ae_int32 *)c_tbl; + for (; N>0; N -= blkLen, x += blkSize, y += blkSize, z += blkSize) + { + blkLen = XT_MIN(N, blkSize); + _0 = 0.0f; + _1 = (1.0f); + half = (0.5f); + { + pX = (const xtfloatx2*)x; + S_wr = (xtfloatx2*)scr; + aX = AE_LA64_PP(pX); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LASX2IP(x0, aX, pX); + + x0 = XT_ABS_SX2(x0); + c0i = AE_L32_I(TBL, 0 * 4); /*-126*/ + c1i = AE_L32_I(TBL, 1 * 4); /*-150*/ + c2f = XT_LSI((xtfloat*)TBL, 2 * 4); + c3f = XT_LSI((xtfloat*)TBL, 3 * 4); + /* process denormalized values */ + bdenorm = XT_OLE_SX2(x0, c2f); + t0 = XT_MUL_SX2(x0, c3f); + XT_MOVT_SX2(x0, t0, bdenorm); + e0 = c0i; + AE_MOVT32X2(e0, c1i, bdenorm); + /* extract exponent */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(c2f);/* load mantissa mask */ //!!!!!!!!!!!!! + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, c5i); + x0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + c4f = XT_LSI((xtfloat*)TBL, 4 * 4); + bsmall = XT_OLT_SX2(x0, c4f); + t0 = XT_ADD_SX2(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_SX2(x0, t0, bsmall); + AE_MOVT32X2(e0, ex0, bsmall); + x0 = XT_SUB_SX2(_1, x0); //!!! + ef0 = XT_FLOAT_SX2(e0, 0); //!!! 
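+ /* At this point |x| has been split as m * 2^e with the mantissa m scaled into [sqrt(0.5), sqrt(2)): x0 holds (1 - m) and ef0 holds (float)e. The next pass evaluates log2(m) from (1 - m) with the xa_nnlib_log2f_coef polynomial and then adds e to obtain log2(|x|). */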
+ XT_SSX2IP(x0, S_wr, 2 * sz_f32); + XT_SSX2IP(ef0, S_wr, 2*2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloatx2 p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloatx2 p10, p11, p12, p13; + xtfloatx2 t2, w0, w1; + S_wr = ( xtfloatx2*)scr+2; + S_rd = (const xtfloatx2*)scr; + TBL_LOG2 = (const xtfloat *)xa_nnlib_log2f_coef; + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(x0, S_rd, 3*2 * sz_f32); + //XT_LSX2IP(ef0, S_rd, 2 * sz_f32); + + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI(TBL_LOG2, 0 * 4); + p1 = XT_LSI(TBL_LOG2, 1 * 4); + p2 = XT_LSI(TBL_LOG2, 2 * 4); + p3 = XT_LSI(TBL_LOG2, 3 * 4); + p4 = XT_LSI(TBL_LOG2, 4 * 4); + p5 = XT_LSI(TBL_LOG2, 5 * 4); + p6 = XT_LSI(TBL_LOG2, 6 * 4); + p7 = XT_LSI(TBL_LOG2, 7 * 4); + p8 = XT_LSX(TBL_LOG2, 8 * 4); + p9 = XT_LSX(TBL_LOG2, 9 * 4); + + XT_MADD_SX2(p1, x0, p0); + XT_MADD_SX2(p2, x0, p1); + XT_MADD_SX2(p3, x0, p2); + XT_MADD_SX2(p4, x0, p3); + XT_MADD_SX2(p5, x0, p4); + XT_MADD_SX2(p6, x0, p5); + XT_MADD_SX2(p7, x0, p6); + XT_MADD_SX2(p8, x0, p7); + XT_MADD_SX2(p9, x0, p8); + t2 = p9; + XT_SSX2IP(t2, S_wr, 3*2 * sz_f32); + } + S_wr = (xtfloatx2*)scr; + S_rd = (const xtfloatx2*)scr; + for (n = 0; n<(blkLen >> 1); n++) + { + p10 = XT_LSX(TBL_LOG2, 10 * 4); + p11 = XT_LSX(TBL_LOG2, 11 * 4); + p12 = XT_LSX(TBL_LOG2, 12 * 4); + p13 = XT_LSX(TBL_LOG2, 13 * 4); + + XT_LSX2IP(x0, S_rd, 2 * sz_f32); + XT_LSX2IP(ef0, S_rd, 2 * sz_f32); + XT_LSX2IP(t2, S_rd, 2 * sz_f32); + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_SX2(x0, t2); t1 = t0; + XT_MSUB_SX2(t1, x0, t2); + w0 = XT_ADD_SX2(t0, p10); + w1 = XT_SUB_SX2(w0, p10); + w1 = XT_SUB_SX2(t0, w1); + w1 = XT_SUB_SX2(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_SX2(x0, t0); w1 = w0; + XT_MSUB_SX2(w1, x0, t0); t0 = w0; + XT_MSUB_SX2(w1, x0, t1); t1 = w1; + w0 = XT_ADD_SX2(t0, p11); + w1 = XT_SUB_SX2(w0, p11); + w1 = XT_SUB_SX2(t0, w1); + w1 = XT_SUB_SX2(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_SX2(x0); + w0 = XT_MUL_SX2(x0, t0); w1 = w0; + XT_MSUB_SX2(w1, x0, t0); t0 = w0; + XT_MSUB_SX2(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_SX2(t0, p12); w1 = w0; + XT_MSUB_SX2(w1, t0, p12); + XT_MADD_SX2(w1, t1, p12); + XT_MSUB_SX2(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_SX2(t0, ef0); + w1 = XT_SUB_SX2(w0, ef0); + w1 = XT_SUB_SX2(t0, w1); + t1 = XT_SUB_SX2(w1, t1);//!!!! + t0 = w0; // !!!!! + XT_SSX2IP(t0, S_wr, 2 * sz_f32); + XT_SSX2IP(t1, S_wr, 2*2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloatx2 xy, dxy, c0, c1; + xtfloatx2 p0, p1, p2, p3, p4, p5, p6; + S_wr = ( xtfloatx2*)scr+2; + S_rd = (const xtfloatx2*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloatx2*)y; + aY = AE_LA64_PP(pY); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(t0, S_rd, 2 * sz_f32); + XT_LSX2IP(t1, S_rd, 2*2 * sz_f32); + + XT_LASX2IP(y0, aY, pY); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FIROUND_SX2(XT_MUL_SX2(y0, t0)); + dxy = XT_NEG_SX2(xy); + XT_MADD_SX2(dxy, y0, t0); + XT_MADD_SX2(dxy, y0, t1); + dxy = XT_MIN_SX2(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_SX2(dxy, (xtfloatx2)-1.0f); + /* compute 2^fract */ + p0 = XT_LSI(TBL_POW2, 0 * 4); + p1 = XT_LSI(TBL_POW2, 1 * 4); + p2 = XT_LSI(TBL_POW2, 2 * 4); + p3 = XT_LSI(TBL_POW2, 3 * 4); + p4 = XT_LSI(TBL_POW2, 4 * 4); + + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_SX2(p1, dxy, p0); + XT_MADD_SX2(p2, dxy, p1); + XT_MADD_SX2(p3, dxy, p2); + XT_MADD_SX2(p4, dxy, p3); + XT_SSX2IP(p4, S_wr, 3*2 * sz_f32); + } + __Pragma("no_reorder"); + S_wr = (xtfloatx2*)scr; + S_rd = (const xtfloatx2*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloatx2*)y; + aY = AE_LA64_PP(pY); + for (n = 0; n<(blkLen >> 1); n++) + { + + XT_LSX2IP(t0, S_rd, 2 * sz_f32); + XT_LSX2IP(t1, S_rd, 2 * sz_f32); + XT_LSX2IP(p4, S_rd, 2 * sz_f32); + p5 = XT_LSI(TBL_POW2, 5 * 4); + p6 = XT_LSI(TBL_POW2, 6 * 4); + XT_LASX2IP(y0, aY, pY); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FIROUND_SX2(XT_MUL_SX2(y0, t0)); + dxy = XT_NEG_SX2(xy); + XT_MADD_SX2(dxy, y0, t0); + XT_MADD_SX2(dxy, y0, t1); + dxy = XT_MIN_SX2(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_SX2(dxy, (xtfloatx2)-1.0f); + XT_MADD_SX2(p5, dxy, p4); + XT_MADD_SX2(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_SX2(xy, 0); + c7i = AE_L32_I(TBL, 7 * 4);/* -252 */ + c8i = AE_L32_X(TBL, 8 * 4);/* 254 */ + e0 = AE_MAX32(e0, c7i); + e0 = AE_MIN32(e0, c8i); + e0 = AE_ADD32(e0, c8i); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + c0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(e0); + c1 = XT_AE_MOVXTFLOATX2_FROMINT32X2(ex0); + z0 = XT_MUL_SX2(z0, c1); + z0 = XT_MUL_SX2(z0, c0); //!!!!!!!!!!!! + XT_SSX2IP(z0, S_wr, 2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtbool2 b_yint, b_e0, b0, b_notspec; + xtbool2 b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool2 b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloatx2 xabs, spec; + ae_int32x2 sgn, zi0; + + S_rd = (const xtfloatx2*)scr; + pY = (const xtfloatx2*)y; + pX = (const xtfloatx2*)x; + pZ = ( xtfloatx2*)z; + aY = AE_LA64_PP(pY); + aX = AE_LA64_PP(pX); + aZ = AE_ZALIGN64(); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(z0, S_rd, 2 * sz_f32); + XT_LASX2IP(x0, aX, pX); + XT_LASX2IP(y0, aY, pY); + /* Take sign of x and y */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + yi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(y0); + bsx = XT_OLT_SX2(xi0, (xtfloatx2)0.0f); + bsy = XT_OLT_SX2(yi0, (xtfloatx2)0.0f); + + xabs = XT_ABS_SX2(x0); + /* check if y is integer */ + t0 = XT_FITRUNC_SX2(y0); + b_yint = XT_OEQ_SX2(t0, y0); + + /* check if y is odd */ + e0 = XT_TRUNC_SX2(y0, 0); //temp0 + b_e0 = AE_EQ32(e0, MAX_INT32);//~b_tmp0 + b0i = AE_MOVAB2(b_e0); + b1i = AE_MOVAB2(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA2(b0i); + AE_MOVF32X2(e0, AE_ZERO32(), b0); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = XT_OEQ_SX2((xtfloatx2)0.0f, y0); /* y ==0 */ + b_yinf = XT_OEQ_SX2(XT_ABS_SX2(y0), xa_nnlib_pow_plusInff.f); /* |y|==Inf */ + b_xeqz = XT_OEQ_SX2(x0, (xtfloatx2)0.0f); /* x ==0 */ + b_xeq1 = XT_OEQ_SX2(xabs, (xtfloatx2)1.0f); /* |x|==1 */ + b_xinf = XT_OEQ_SX2(xabs, xa_nnlib_pow_plusInff.f); /* |x|==INF */ + + yint = AE_MOVAB2(b_yint); + yeqz = AE_MOVAB2(b_yeqz); + yinf = AE_MOVAB2(b_yinf); + xeqz = AE_MOVAB2(b_xeqz); + xeq1 = AE_MOVAB2(b_xeq1); + xinf = AE_MOVAB2(b_xinf); + sx = AE_MOVAB2(bsx); + sy = AE_MOVAB2(bsy); + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* 
x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA2(NaN1); + b_NaN2 = XT_UN_SX2(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA2(one); + b_Inf = AE_MOVBA2(Inf); + b_zero = AE_MOVBA2(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloatx2)xa_nnlib_pow_qNaNf.f; + XT_MOVF_SX2(spec, half, b_NaN1); + XT_MOVT_SX2(spec, _0, b_zero); + XT_MOVT_SX2(spec, xa_nnlib_pow_plusInff.f, b_Inf); + XT_MOVT_SX2(spec, xa_nnlib_pow_qNaNf.f, b_NaN2); + XT_MOVT_SX2(spec, _1, b_one); + + b_notspec = XT_OEQ_SX2(spec, half); + /* Replace result with special numbers if needed */ + XT_MOVF_SX2(z0, spec, b_notspec); + /* Restore sign and store result */ + zi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(zi0); + XT_SASX2IP(z0, aZ, pZ); + } + } + XT_SASX2POSFP(aZ, pZ); + } +} /* mypowf() */ +void xa_nn_elm_pow_f32( FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + int N ) +{ + const int blkSize = MAX_ALLOCA_SZ/sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + FLOAT32 ALIGN(16) scr[blkSize]; + int M; + if ( N<=0 ) return; + M=N&~1; + if ( M ) + { + mypowf(scr,z,x,y,M); + y += M; + x += M; + z += M; + N&=1; + } + if (N) + { // processing the tail + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + xtfloat x0, y0, t0, ef0, t1, t2; + xtfloat xy, dxy, z0, c0, c1; + xtfloat p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloat p10, p11, p12, p13, w0, w1; + xtbool bdenorm, bsmall; + ae_int32 e0, xi0, ex0; + x0=XT_LSI((const xtfloat*)x,0); + + x0 = XT_ABS_S(x0); + + /* process denormalized values */ + bdenorm = xtbool2_extract_0(XT_OLE_S(x0, XT_LSI((xtfloat*)c_tbl, 2 * 4))); + t0 = XT_MUL_S(x0, XT_LSI((xtfloat*)c_tbl, 3 * 4)); + XT_MOVT_S(x0, t0, (bdenorm)); + e0 = AE_L32_I((ae_int32 *)c_tbl, 0 * 4);; + AE_MOVT_32(e0, AE_L32_I((ae_int32 *)c_tbl, 1 * 4), (bdenorm)); + /* extract exponent */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(XT_LSI((xtfloat*)c_tbl, 2 * 4));/* load mantissa mask */ //!!!!!!!!!!!!! + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, AE_L32_I((ae_int32 *)c_tbl, 5 * 4)); + x0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + + bsmall = xtbool2_extract_0(XT_OLT_S(x0, XT_LSI((xtfloat*)c_tbl, 4 * 4))); + + + t0 = XT_ADD_S(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_S(x0, t0, bsmall); + AE_MOVT_32(e0, ex0, bsmall); + x0 = XT_SUB_S(1.0f, x0); //!!! + ef0 = XT_FLOAT_S(e0, 0); //!!! 
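+ /* Same decomposition as the vector path: x0 = 1 - m and ef0 = (float)e with |x| = m * 2^e, m in [sqrt(0.5), sqrt(2)). The log2 polynomial and 2^fract steps below now run once for this single tail element. */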
+ + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 0 * 4); + p1 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 1 * 4); + p2 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 2 * 4); + p3 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 3 * 4); + p4 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 4 * 4); + p5 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 5 * 4); + p6 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 6 * 4); + p7 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 7 * 4); + p8 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 8 * 4); + p9 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 9 * 4); + + + XT_MADD_S(p1, x0, p0); + XT_MADD_S(p2, x0, p1); + XT_MADD_S(p3, x0, p2); + XT_MADD_S(p4, x0, p3); + XT_MADD_S(p5, x0, p4); + XT_MADD_S(p6, x0, p5); + XT_MADD_S(p7, x0, p6); + XT_MADD_S(p8, x0, p7); + XT_MADD_S(p9, x0, p8); + t2 = p9; + + + p10 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 10 * 4); + p11 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 11 * 4); + p12 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 12 * 4); + p13 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 13 * 4); + + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_S(x0, t2); t1 = t0; + XT_MSUB_S(t1, x0, t2); + w0 = XT_ADD_S(t0, p10); + w1 = XT_SUB_S(w0, p10); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + w0 = XT_ADD_S(t0, p11); + w1 = XT_SUB_S(w0, p11); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_S(x0); + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_S(t0, p12); w1 = w0; + XT_MSUB_S(w1, t0, p12); + XT_MADD_S(w1, t1, p12); + XT_MSUB_S(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_S(t0, ef0); + w1 = XT_SUB_S(w0, ef0); + w1 = XT_SUB_S(t0, w1); + t1 = XT_SUB_S(w1, t1);//!!!! + t0 = w0; // !!!!! + + /* compute y*log2(x) and separate result into integer and fractional parts */ + y0 = XT_LSI((const xtfloat*)y, 0); + xy = XT_FIROUND_S(XT_MUL_S(y0, t0)); + dxy = XT_NEG_S(xy); + XT_MADD_S(dxy, y0, t0); + XT_MADD_S(dxy, y0, t1); + dxy = XT_MIN_S(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_S(dxy, (xtfloatx2)-1.0f); + /* compute 2^fract */ + p0 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 0 * 4); + p1 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 1 * 4); + p2 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 2 * 4); + p3 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 3 * 4); + p4 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 4 * 4); + p5 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 5 * 4); + p6 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 6 * 4); + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_S(p1, dxy, p0); + XT_MADD_S(p2, dxy, p1); + XT_MADD_S(p3, dxy, p2); + XT_MADD_S(p4, dxy, p3); + XT_MADD_S(p5, dxy, p4); + XT_MADD_S(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_SX2(xy, 0); + e0 = AE_MAX32(e0, AE_L32_I((ae_int32 *)c_tbl, 7 * 4)); + e0 = AE_MIN32(e0, AE_L32_X((ae_int32 *)c_tbl, 8 * 4)); + e0 = AE_ADD32(e0, AE_L32_X((ae_int32 *)c_tbl, 8 * 4)); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + c0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(e0); + c1 = XT_AE_MOVXTFLOATX2_FROMINT32X2(ex0); + z0 = XT_MUL_S(z0, c1); + z0 = XT_MUL_S(z0, c0); //!!!!!!!!!!!! + + + /* Take sign of x and y */ + { + xtbool2 bsx, bsy, b_yint, b_e0, b0, b_notspec; + + xtbool2 b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool2 b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloat xabs, spec; + ae_int32 sgn, zi0; + + x0 = XT_LSI((const xtfloat*)x, 0); + y0 = XT_LSI((const xtfloat*)y, 0); + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + bsx = (XT_OLT_S(x0, (xtfloat)0.0f)); + bsy = (XT_OLT_S(y0, (xtfloat)0.0f)); + + xabs = XT_ABS_S(x0); + /* check if y is integer */ + t0 = XT_FITRUNC_S(y0); + b_yint = (XT_OEQ_S(t0, y0)); + + /* check if y is odd */ + e0 = XT_TRUNC_S(y0, 0); //temp0 + b_e0 = (AE_EQ32(e0, MAX_INT32));//~b_tmp0 + b0i = AE_MOVAB2(b_e0); + b1i = AE_MOVAB2(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA2(b0i); + AE_MOVF_32(e0, AE_ZERO32(), xtbool2_extract_0(b0)); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = (XT_OEQ_S((xtfloatx2)0.0f, y0)); /* y ==0 */ + b_yinf = (XT_OEQ_S(XT_ABS_SX2(y0), xa_nnlib_pow_plusInff.f)); /* |y|==Inf */ + b_xeqz = (XT_OEQ_S(x0, (xtfloatx2)0.0f)); /* x ==0 */ + b_xeq1 = (XT_OEQ_S(xabs, (xtfloatx2)1.0f)); /* |x|==1 */ + b_xinf = (XT_OEQ_S(xabs, xa_nnlib_pow_plusInff.f)); /* |x|==INF */ + + yint = AE_MOVAB2 (b_yint); + yeqz = AE_MOVAB2 (b_yeqz); + yinf = AE_MOVAB2 (b_yinf); + xeqz = AE_MOVAB2 (b_xeqz); + xeq1 = AE_MOVAB2 (b_xeq1); + xinf = AE_MOVAB2 (b_xinf); + sx = AE_MOVAB2 (bsx); + sy = AE_MOVAB2 (bsy); + + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA2(NaN1); + b_NaN2 = XT_UN_SX2(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA2(one); + b_Inf = AE_MOVBA2(Inf); + b_zero = AE_MOVBA2(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloat)xa_nnlib_pow_qNaNf.f; + XT_MOVF_S(spec, 0.5f, xtbool2_extract_0(b_NaN1)); + XT_MOVT_S(spec, 0.0f, xtbool2_extract_0(b_zero)); + XT_MOVT_S(spec, xa_nnlib_pow_plusInff.f, xtbool2_extract_0(b_Inf)); + XT_MOVT_S(spec, xa_nnlib_pow_qNaNf.f, xtbool2_extract_0(b_NaN2)); + XT_MOVT_S(spec, 1.0f, xtbool2_extract_0(b_one)); + + b_notspec = XT_OEQ_S(spec, 0.5f); + /* Replace result with special numbers if needed */ + XT_MOVF_S(z0, spec, xtbool2_extract_0(b_notspec)); + /* Restore sign and store result */ + zi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(zi0); + + XT_SSI(z0,(xtfloat*)z,0); + + } + } + +} /* vec_powf() */ +#else 
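+/* Scalar (FPU-only) fallback: the same blocked three-pass algorithm as the VFPU path above (split |x| into mantissa and exponent, evaluate log2 via the polynomial tables, then reconstruct 2^(y*log2|x|) and patch the special cases), processing one element per loop iteration. */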
+#define sz_f32 (int)sizeof(FLOAT32) +void xa_nn_elm_pow_f32(FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + int N) +{ + + const int blkSizef = MAX_ALLOCA_SZ / sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + float ALIGN(16) scr[blkSizef]; + /* Table of different constants used in computations */ + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + int n; + const xtfloat * pX; + const xtfloat * pY; + + const xtfloat * restrict S_rd; + xtfloat * restrict S_wr; + xtfloat * restrict pZ; + const ae_int32 * restrict TBL; + const xtfloat * restrict TBL_LOG2; + const xtfloat * restrict TBL_POW2; + xtfloat x0, y0, z0, t0, t1, ef0; + xtfloat c2f, c3f, c4f; + xtfloat _0, _1, half; + ae_int32x2 c0i, c1i, c5i, c6i, c7i, c8i; + ae_int32 e0, xi0, yi0, ex0; + xtbool bsx, bsy, bdenorm, bsmall; + + /* overall number of blocks; number of values in the current block */ + int blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ / (3 * sz_f32); + + + if (N <= 0) return; + + NASSERT_ALIGN16(scr); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + blkLen = 0; + TBL = (const ae_int32 *)c_tbl; + for (; N>0; N -= blkLen, x += blkSize, y += blkSize, z += blkSize) + { + blkLen = XT_MIN(N, blkSize); + _0 = 0.0f; + _1 = (1.0f); + half = (0.5f); + { + pX = (const xtfloat*)x; + S_wr = ( xtfloat*)scr; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(x0, pX, sz_f32); + + x0 = XT_ABS_S(x0); + c0i = AE_L32_I(TBL, 0 * 4); /* -126 */ + c1i = AE_L32_I(TBL, 1 * 4); /* -150 */ + c2f = XT_LSI((xtfloat*)TBL, 2 * 4); + c3f = XT_LSI((xtfloat*)TBL, 3 * 4); + /* process denormalized values */ + bdenorm = XT_OLE_S(x0, c2f); + t0 = XT_MUL_S(x0, c3f); + XT_MOVT_S(x0, t0, bdenorm); + e0 = c0i; + + AE_MOVT_32(e0, c1i, bdenorm); + /* extract exponent */ + xi0 = XT_RFR(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_RFR(c2f);/* load mantissa mask */ //!!!!!!!!!!!!! + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, c5i); + x0 = XT_WFR(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + c4f = XT_LSI((xtfloat*)TBL, 4 * 4); + bsmall = XT_OLT_S(x0, c4f); + t0 = XT_ADD_S(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_S(x0, t0, bsmall); + AE_MOVT_32(e0, ex0, bsmall); + x0 = XT_SUB_S(_1, x0); //!!! + ef0 = XT_FLOAT_S(e0, 0); //!!! 
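+ /* As in the VFPU build: the scratch buffer now holds (1 - m) and (float)e for this element, ready for the log2 polynomial pass below. */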
+ XT_SSIP(x0, S_wr, sz_f32); + XT_SSIP(ef0, S_wr, 2 * sz_f32); + + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloat p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloat p10, p11, p12, p13; + xtfloat t2, w0, w1; + S_wr = ( xtfloat*)scr + 2; + S_rd = (const xtfloat*)scr; + TBL_LOG2 = (const xtfloat *)xa_nnlib_log2f_coef; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(x0, S_rd, 3*sz_f32); + + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI(TBL_LOG2, 0 * 4); + p1 = XT_LSI(TBL_LOG2, 1 * 4); + p2 = XT_LSI(TBL_LOG2, 2 * 4); + p3 = XT_LSI(TBL_LOG2, 3 * 4); + p4 = XT_LSI(TBL_LOG2, 4 * 4); + p5 = XT_LSI(TBL_LOG2, 5 * 4); + p6 = XT_LSI(TBL_LOG2, 6 * 4); + p7 = XT_LSI(TBL_LOG2, 7 * 4); + p8 = XT_LSX(TBL_LOG2, 8 * 4); + p9 = XT_LSX(TBL_LOG2, 9 * 4); + + XT_MADD_S(p1, x0, p0); + XT_MADD_S(p2, x0, p1); + XT_MADD_S(p3, x0, p2); + XT_MADD_S(p4, x0, p3); + XT_MADD_S(p5, x0, p4); + XT_MADD_S(p6, x0, p5); + XT_MADD_S(p7, x0, p6); + XT_MADD_S(p8, x0, p7); + XT_MADD_S(p9, x0, p8); + t2 = p9; + XT_SSIP(t2, S_wr, 3 * sz_f32); + } + S_wr = ( xtfloat*)scr; + S_rd = (const xtfloat*)scr; + + for (n = 0; n<(blkLen); n++) + { + p10 = XT_LSX(TBL_LOG2, 10 * 4); + p11 = XT_LSX(TBL_LOG2, 11 * 4); + p12 = XT_LSX(TBL_LOG2, 12 * 4); + p13 = XT_LSX(TBL_LOG2, 13 * 4); + + XT_LSIP(x0, S_rd, sz_f32); + XT_LSIP(ef0, S_rd, sz_f32); + XT_LSIP(t2, S_rd, sz_f32); + + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_S(x0, t2); t1 = t0; + XT_MSUB_S(t1, x0, t2); + w0 = XT_ADD_S(t0, p10); + w1 = XT_SUB_S(w0, p10); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + w0 = XT_ADD_S(t0, p11); + w1 = XT_SUB_S(w0, p11); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_S(x0); + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_S(t0, p12); w1 = w0; + XT_MSUB_S(w1, t0, p12); + XT_MADD_S(w1, t1, p12); + XT_MSUB_S(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_S(t0, ef0); + w1 = XT_SUB_S(w0, ef0); + w1 = XT_SUB_S(t0, w1); + t1 = XT_SUB_S(w1, t1);//!!!! + t0 = w0; // !!!!! + XT_SSIP(t0, S_wr, sz_f32); + XT_SSIP(t1, S_wr, sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloat xy, dxy, c0, c1, _m1;; + xtfloat p0, p1, p2, p3, p4, p5, p6; + S_wr = ( xtfloat*)scr; + S_rd = (const xtfloat*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloat*)y; + _m1 = -1.0f; + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(t0, S_rd, sz_f32); + XT_LSIP(t1, S_rd, sz_f32); + XT_LSIP(y0, pY, sz_f32); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FLOAT_S(XT_ROUND_S(XT_MUL_S(y0, t0), 0), 0); + dxy = XT_NEG_S(xy); + XT_MADD_S(dxy, y0, t0); + XT_MADD_S(dxy, y0, t1); + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + c6i = AE_L32_I(TBL, 6 * 4);/* -0.5 */ + dxy = XT_MIN_S(dxy, _1); + dxy = XT_MAX_S(dxy, _m1); + /* compute 2^fract */ + p0 = XT_LSI(TBL_POW2, 0 * 4); + p1 = XT_LSI(TBL_POW2, 1 * 4); + p2 = XT_LSI(TBL_POW2, 2 * 4); + p3 = XT_LSI(TBL_POW2, 3 * 4); + p4 = XT_LSI(TBL_POW2, 4 * 4); + p5 = XT_LSI(TBL_POW2, 5 * 4); + p6 = XT_LSI(TBL_POW2, 6 * 4); + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_S(p1, dxy, p0); + XT_MADD_S(p2, dxy, p1); + XT_MADD_S(p3, dxy, p2); + XT_MADD_S(p4, dxy, p3); + XT_MADD_S(p5, dxy, p4); + XT_MADD_S(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_S(xy, 0); + c7i = AE_L32_I(TBL, 7 * 4);/* -252 */ + c8i = AE_L32_X(TBL, 8 * 4);/* 254 */ + e0 = AE_MAX32(e0, c7i); + e0 = AE_MIN32(e0, c8i); + e0 = AE_ADD32(e0, c8i); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + + c0 = XT_WFR(e0); + c1 = XT_WFR(ex0); + z0 = XT_MUL_S(z0, c1); + z0 = XT_MUL_S(z0, c0); //!!!!!!!!!!!! + XT_SSIP(z0, S_wr, sz_f32); + + } + } + __Pragma("no_reorder"); + /* */ + { + xtbool b_yint, b_e0, b0, b_notspec; + xtbool b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloat xabs, spec; + ae_int32x2 sgn, zi0; + + S_rd = (const xtfloat*)scr; + pY = (const xtfloat*)y; + pX = (const xtfloat*)x; + pZ = (xtfloat*)z; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(z0, S_rd, sz_f32); + XT_LSIP(x0, pX, sz_f32); + XT_LSIP(y0, pY, sz_f32); + + /* Take sign of x and y */ + xi0 = XT_RFR(x0); + yi0 = XT_RFR(y0); + bsx = XT_OLT_S(x0, (xtfloat)0.0f); + bsy = XT_OLT_S(y0, (xtfloat)0.0f); + + xabs = XT_ABS_S(x0); + /* check if y is integer */ + { /* validate if y is integral - all numbers bigger than 2^23 are assumed as integral */ + xtfloat t, c; + t = XT_ABS_S((xtfloat)y0); + c = 8388608.f; + XT_MOVT_S(c, t, XT_ULT_S(t, 8388608.f)); + t = c; + t0 = XT_FLOAT_S(XT_TRUNC_S(t, 0), 0); + b_yint = XT_OEQ_S(XT_FLOAT_S(XT_TRUNC_S(t, 0), 0), t); + } + + /* check if y is odd */ + e0 = XT_TRUNC_S(y0, 0); //temp0 + b_e0 = xtbool2_extract_0(AE_EQ32(e0, MAX_INT32));//~b_tmp0 + b0i = AE_MOVAB(b_e0); + b1i = AE_MOVAB(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA(b0i); + AE_MOVF_32(e0, AE_ZERO32(), b0); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = XT_OEQ_S((xtfloat)0.0f, y0); /* y ==0 */ + b_yinf = XT_OEQ_S(XT_ABS_S(y0), xa_nnlib_pow_plusInff.f); /* |y|==Inf */ + b_xeqz = XT_OEQ_S(x0, (xtfloat)0.0f); /* x ==0 */ + b_xeq1 = XT_OEQ_S(xabs, (xtfloat)1.0f); /* |x|==1 */ + b_xinf = XT_OEQ_S(xabs, xa_nnlib_pow_plusInff.f); /* |x|==INF */ + + yint = AE_MOVAB(b_yint); + yeqz = AE_MOVAB(b_yeqz); + yinf = AE_MOVAB(b_yinf); + xeqz = AE_MOVAB(b_xeqz); + xeq1 = AE_MOVAB(b_xeq1); + xinf = AE_MOVAB(b_xinf); + sx = AE_MOVAB(bsx); + sy = AE_MOVAB(bsy); + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA(NaN1); + b_NaN2 = XT_UN_S(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA(one); + b_Inf = AE_MOVBA(Inf); + b_zero = AE_MOVBA(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloat)xa_nnlib_pow_qNaNf.f; + XT_MOVF_S(spec, half, b_NaN1); + XT_MOVT_S(spec, _0, b_zero); + XT_MOVT_S(spec, xa_nnlib_pow_plusInff.f, b_Inf); + XT_MOVT_S(spec, xa_nnlib_pow_qNaNf.f, b_NaN2); + XT_MOVT_S(spec, _1, b_one); + + b_notspec = XT_OEQ_S(spec, half); + /* Replace result with special numbers if needed */ + XT_MOVF_S(z0, spec, b_notspec); + /* 
Restore sign and store result */ + zi0 = XT_RFR(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_WFR(zi0); + XT_SSIP(z0, pZ, sz_f32); + } + } + } + +} /* vec_powf() */ +#endif From 1dab7a9ff145cd74024388963c15c888752b7958 Mon Sep 17 00:00:00 2001 From: dijopaul <87994875+dijopaul@users.noreply.github.com> Date: Wed, 27 Nov 2024 21:28:10 +0530 Subject: [PATCH 08/27] Upgrade nnlib to latest 4.2.0 (#7105) Upgrading nnlib to latest version 4.2.0 --- backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 b/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 index 6a9ea45e23..102944a6f7 160000 --- a/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 +++ b/backends/cadence/hifi/third-party/nnlib/nnlib-hifi4 @@ -1 +1 @@ -Subproject commit 6a9ea45e23ef591fe207442df33a5ebe88bbe8de +Subproject commit 102944a6f76a0de4d81adc431f3f132f517aa87f From d136206861a8d00c61475d133f4a3e9634b12bb7 Mon Sep 17 00:00:00 2001 From: ckmadhira Date: Wed, 27 Nov 2024 21:38:29 +0530 Subject: [PATCH 09/27] For broadcast, added support to process distinct input dimensions (#7107) For broadcast, added support for distinct dimensions for both the inputs. Also, added support for processing dimension size more than 5. Signed-off-by: cmadhira@cadence.com Co-authored-by: cmadhira@cadence.com --- .../cadence/fusion_g3/operators/op_add.cpp | 48 +++++++++++++------ .../cadence/fusion_g3/operators/op_mul.cpp | 48 +++++++++++++------ 2 files changed, 66 insertions(+), 30 deletions(-) diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp index 6dc710ce6e..551c6652f1 100644 --- a/backends/cadence/fusion_g3/operators/op_add.cpp +++ b/backends/cadence/fusion_g3/operators/op_add.cpp @@ -76,27 +76,45 @@ Tensor& add_out( int inp2_shape[kTensorDimensionLimit]; int out_shape[kTensorDimensionLimit]; - /* input shapes and output shapes */ - for (auto i = 0; i < a_size.size(); i++) { - inp1_shape[i] = a_size[i]; - } - - for (auto i = 0; i < b_size.size(); i++) { - inp2_shape[i] = b_size[i]; - } - - for (auto i = 0; i < out_size.size(); i++) { - out_shape[i] = out_size[i]; - } - /*find broadcast*/ const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); const bool broadcast = (a_is_broadcasted || b_is_broadcasted); int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; - if (compute_type == ScalarType::Int) { + bool optimized = 1; + + if ((a.dim() == 0) || (b.dim() == 0)) { + optimized = 0; + } + + if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) { + optimized = 0; + } + + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } + + if ((compute_type == ScalarType::Int) && (optimized)){ const int* const inp1_data = a.const_data_ptr(); const int* const inp2_data = b.const_data_ptr(); int* const out_data = out.mutable_data_ptr(); @@ -117,7 +135,7 @@ Tensor& add_out( xa_nn_elm_add_32x32_32( out_data, inp1_data, inp2_data, alpha_val, out.numel()); } - } else if (compute_type == ScalarType::Float) { + } else if ((compute_type == ScalarType::Float) && (optimized)) { const float* const inp1_data = a.const_data_ptr(); const float* const inp2_data = b.const_data_ptr(); float* const out_data = out.mutable_data_ptr(); diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp index 366982ae3f..82e84bdbe1 100644 --- a/backends/cadence/fusion_g3/operators/op_mul.cpp +++ b/backends/cadence/fusion_g3/operators/op_mul.cpp @@ -68,27 +68,45 @@ Tensor& mul_out( int inp2_shape[kTensorDimensionLimit]; int out_shape[kTensorDimensionLimit]; - /* input shapes and output shapes */ - for (auto i = 0; i < a_size.size(); i++) { - inp1_shape[i] = a_size[i]; - } - - for (auto i = 0; i < b_size.size(); i++) { - inp2_shape[i] = b_size[i]; - } - - for (auto i = 0; i < out_size.size(); i++) { - out_shape[i] = out_size[i]; - } - /*find broadcast*/ const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); const bool broadcast = (a_is_broadcasted || b_is_broadcasted); int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; - if (compute_type == ScalarType::Int) { + bool optimized = 1; + + if ((a.dim() == 0) || (b.dim() == 0)) { + optimized = 0; + } + + if ((broadcast == 1) && (max_dim > kTensorDimensionLimit)) { + optimized = 0; + } + + for (int i = 0; i < max_dim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int offset_out = max_dim - out.dim(); + int offset_inp1 = max_dim - a.dim(); + int offset_inp2 = max_dim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + offset_out] = out.size(i); + } + for (int i = 0; i < a.dim(); i++) { + inp1_shape[i + offset_inp1] = a.size(i); + } + for (int i = 0; i < b.dim(); i++) { + inp2_shape[i + offset_inp2] = b.size(i); + } + + if ((compute_type == ScalarType::Int) && (optimized)) { const int* const inp1_data = a.const_data_ptr(); const int* const inp2_data = b.const_data_ptr(); int* const out_data = out.mutable_data_ptr(); @@ -105,7 +123,7 @@ Tensor& mul_out( } else { xa_nn_elm_mul_32x32_32(out_data, inp1_data, inp2_data, out.numel()); } - } else if (compute_type == ScalarType::Float) { + } else if ((compute_type == ScalarType::Float) && (optimized)) { const float* const inp1_data = a.const_data_ptr(); const float* const inp2_data = b.const_data_ptr(); float* const out_data = out.mutable_data_ptr(); From a2619e1dae77af9448b53da5bf9f342e3aa9cc0a Mon Sep 17 00:00:00 2001 From: David Lin Date: Wed, 27 Nov 2024 10:35:10 -0800 Subject: [PATCH 10/27] Fix lints from HUD (#7110) run lintrunner Co-authored-by: lind --- .../cadence/fusion_g3/operators/op_add.cpp | 12 +++---- .../cadence/fusion_g3/operators/op_mul.cpp | 10 +++--- .../cadence/hifi/operators/op_maximum.cpp | 1 - backends/cadence/hifi/operators/op_pow.cpp | 1 - .../hifi/operators/quantized_linear_out.cpp | 31 +++++++++---------- 5 files changed, 26 insertions(+), 29 deletions(-) diff --git a/backends/cadence/fusion_g3/operators/op_add.cpp b/backends/cadence/fusion_g3/operators/op_add.cpp index 551c6652f1..9537cbacb7 100644 --- a/backends/cadence/fusion_g3/operators/op_add.cpp +++ b/backends/cadence/fusion_g3/operators/op_add.cpp @@ -95,15 +95,15 @@ Tensor& add_out( } for (int i = 0; i < max_dim; i++) { - out_shape[i] = 1; + out_shape[i] = 1; inp1_shape[i] = 1; inp2_shape[i] = 1; } - - int offset_out = max_dim - out.dim(); + + int offset_out = max_dim - out.dim(); int offset_inp1 = max_dim - a.dim(); int offset_inp2 = max_dim - b.dim(); - + for (int i = 0; i < out.dim(); i++) { out_shape[i + offset_out] = out.size(i); } @@ -111,10 +111,10 @@ Tensor& add_out( inp1_shape[i + offset_inp1] = a.size(i); } for (int i = 0; i < b.dim(); i++) { - inp2_shape[i + offset_inp2] = b.size(i); + inp2_shape[i + offset_inp2] = b.size(i); } - if ((compute_type == ScalarType::Int) && (optimized)){ + if ((compute_type == ScalarType::Int) && (optimized)) { const int* const inp1_data = a.const_data_ptr(); const int* const inp2_data = b.const_data_ptr(); int* const out_data = out.mutable_data_ptr(); diff --git a/backends/cadence/fusion_g3/operators/op_mul.cpp b/backends/cadence/fusion_g3/operators/op_mul.cpp index 82e84bdbe1..31cd50314e 100644 --- a/backends/cadence/fusion_g3/operators/op_mul.cpp +++ b/backends/cadence/fusion_g3/operators/op_mul.cpp @@ -87,15 +87,15 @@ Tensor& mul_out( } for (int i = 0; i < max_dim; i++) { - out_shape[i] = 1; + out_shape[i] = 1; inp1_shape[i] = 1; inp2_shape[i] = 1; } - - int offset_out = max_dim - out.dim(); + + int offset_out = max_dim - out.dim(); int offset_inp1 = max_dim - a.dim(); int offset_inp2 = max_dim - b.dim(); - + for (int i = 
0; i < out.dim(); i++) { out_shape[i + offset_out] = out.size(i); } @@ -103,7 +103,7 @@ Tensor& mul_out( inp1_shape[i + offset_inp1] = a.size(i); } for (int i = 0; i < b.dim(); i++) { - inp2_shape[i + offset_inp2] = b.size(i); + inp2_shape[i + offset_inp2] = b.size(i); } if ((compute_type == ScalarType::Int) && (optimized)) { diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp index f9a3658891..f85d3470e9 100644 --- a/backends/cadence/hifi/operators/op_maximum.cpp +++ b/backends/cadence/hifi/operators/op_maximum.cpp @@ -23,7 +23,6 @@ using torch::executor::apply_binary_elementwise_fn; using torch::executor::Error; using torch::executor::resize_to_broadcast_target_size; - namespace cadence { namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp index 9669e96123..1399c24a34 100644 --- a/backends/cadence/hifi/operators/op_pow.cpp +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -351,4 +351,3 @@ Tensor& pow_Scalar_out( } // namespace HiFi } // namespace impl } // namespace cadence - diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index accc610132..b8e1d117fb 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -26,8 +26,7 @@ using ::executorch::aten::Tensor; using ::executorch::runtime::getLeadingDims; using ::executorch::runtime::KernelRuntimeContext; - - // The nnlib kernel to compute quantized linear via matmul. +// The nnlib kernel to compute quantized linear via matmul. void _quantized_linear_asym8u( const Tensor& in, @@ -48,22 +47,22 @@ void _quantized_linear_asym8u( const int32_t* __restrict__ bias_data = bias.const_data_ptr(); uint8_t* __restrict__ out_data = out.mutable_data_ptr(); int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u( - out_data, - weight_data, - in_data, - bias_data, - out_dim, - in_dim, - in_dim, - leading_dims, - in_dim, - out_dim, - 1, + out_data, + weight_data, + in_data, + bias_data, + out_dim, + in_dim, + in_dim, + leading_dims, + in_dim, + out_dim, + 1, -weight_zero_point.const_data_ptr()[0], // mat1_zero_bias -in_zero_point, // mat2_zero_bias - out_multiplier.const_data_ptr()[0], - out_shift.const_data_ptr()[0], - out_zero_point); + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + out_zero_point); ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed"); } From 8b375f25332527a4fb8385839afb01f9d9df260d Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 27 Nov 2024 11:04:14 -0800 Subject: [PATCH 11/27] Fix pyre in arm_backend.py Differential Revision: D66475070 Pull Request resolved: https://github.com/pytorch/executorch/pull/7069 --- backends/arm/arm_backend.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py index 59473a9e6d..c59eedc304 100644 --- a/backends/arm/arm_backend.py +++ b/backends/arm/arm_backend.py @@ -135,7 +135,9 @@ def set_quantize_io(self, quantize_io: bool = False) -> "ArmCompileSpecBuilder": self.quantize_io = quantize_io return self - def set_input_order(self, input_order: str = None) -> "ArmCompileSpecBuilder": + def set_input_order( + self, input_order: Optional[str] = None + ) -> "ArmCompileSpecBuilder": """ Reorder the inputs coming in. This may be required when inputs > 1. 
And while using the U55/U85 CompileSpec. From 27638c33e7e70d15eb4b495a860c43d5f37c4dae Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 27 Nov 2024 12:02:10 -0800 Subject: [PATCH 12/27] move rope related logic together (#7113) Pull Request resolved: https://github.com/pytorch/executorch/pull/6560 Right now, rope related code scatters around a few different places in `llama_transformer`. It makes it hard to make changes to rope related things. This PR moves all rope related logic into its own module. ghstack-source-id: 255543205 Differential Revision: [D65173598](https://our.internmc.facebook.com/intern/diff/D65173598/) Co-authored-by: Lunwen He --- examples/models/llama/llama_transformer.py | 139 +++++++++++------- .../llama/source_transformation/rope.py | 28 ++-- 2 files changed, 101 insertions(+), 66 deletions(-) diff --git a/examples/models/llama/llama_transformer.py b/examples/models/llama/llama_transformer.py index 3f8b8dd654..10d660d37a 100644 --- a/examples/models/llama/llama_transformer.py +++ b/examples/models/llama/llama_transformer.py @@ -147,6 +147,81 @@ def __post_init__(self): self.head_dim = self.dim // self.n_heads +class Rope(torch.nn.Module): + def __init__(self, params: ModelArgs): + super().__init__() + self.params = params + if self.params.use_hf_rope: + self.precompute_freqs_cis = hf_precompute_freqs_cis + else: + self.precompute_freqs_cis = partial( + precompute_freqs_cis, use_scaled=self.params.use_scaled_rope + ) + freqs_cos, freqs_sin = self.precompute_freqs_cis( + self.params.head_dim, + ( + self.params.max_seq_len # Normal llama2. + if self.params.ffn_dim_multiplier is None + else self.params.max_seq_len * 2 # Sharded checkpoint. + ), + self.params.rope_freq_base, + ) + self.register_buffer("freqs_cos", freqs_cos, persistent=False) + self.register_buffer("freqs_sin", freqs_sin, persistent=False) + if self.params.use_hf_rope: + self.apply_rotary_emb = hf_apply_rotary_emb + else: + self.apply_rotary_emb = RotaryEmbedding() + + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + freqs_cos: torch.Tensor, + freqs_sin: torch.Tensor, + ): + return self.apply_rotary_emb(q, k, freqs_cos, freqs_sin) + + def get_freqs(self, input_pos: Optional[torch.Tensor], seq_len: int): + """ + Get the precomputed frequencies for the given input position and sequence length. + + Args: + input_pos (torch.Tensor): The input position tensor. + seq_len (int): The sequence length. + + Returns: + Tuple[torch.Tensor, torch.Tensor]: The precomputed frequencies for the given input position and sequence length. + """ + if self.params.use_kv_cache: + assert ( + input_pos is not None + ), "input_pos must be provided when use_kv_cache is True" + + if self.params.enable_dynamic_shape: + # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos. + input_pos_item = input_pos[-1].item() + torch._check_is_size(input_pos_item) + torch._check(input_pos_item < self.params.max_seq_len) + # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor + freqs_cos = self.freqs_cos.narrow(0, input_pos_item, seq_len) + # pyre-ignore: Incompatible parameter type [6] + freqs_sin = self.freqs_sin.narrow(0, input_pos_item, seq_len) + else: + # When not using dynamic shape, use of the .item results in + # symints, due to querying the data from tensor. + # this path avoids that for mps backend, although probably mps backend + # can support dynamic shape? 
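+ # (Indexing with the position tensor, rather than calling .item() and
+ # narrow(), keeps this branch free of data-dependent symints at export.)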
+ freqs_cos = self.freqs_cos[input_pos] + freqs_sin = self.freqs_sin[input_pos] + + else: + assert input_pos is None, "input_pos is unused when use_kv_cache is False" + freqs_cos = self.freqs_cos[:seq_len] + freqs_sin = self.freqs_sin[:seq_len] + return freqs_cos, freqs_sin + + class KVCache(nn.Module): def __init__( self, @@ -266,7 +341,7 @@ def forward( class Attention(nn.Module): - def __init__(self, args: ModelArgs, layer_id: int): + def __init__(self, args: ModelArgs, layer_id: int, rope: Rope): super().__init__() self.use_kv_cache = args.use_kv_cache self.n_heads = args.n_heads @@ -287,6 +362,8 @@ def __init__(self, args: ModelArgs, layer_id: int): self.layer_id = layer_id + self.rope = rope + causal_mask = torch.tril( torch.ones( self.max_seq_len, @@ -303,7 +380,7 @@ def __init__(self, args: ModelArgs, layer_id: int): args.max_seq_len, self.n_kv_heads, self.head_dim, - not args.use_sdpa_with_kv_cache_op, # if we are using the custom op dont transpose the cache. Expect untransposed q k v + not args.use_sdpa_with_kv_cache_op, # if we are using the custom op don't transpose the cache. Expect untransposed q k v args.enable_dynamic_shape, ) self.SDPA = SDPA( @@ -314,10 +391,6 @@ def __init__(self, args: ModelArgs, layer_id: int): max_seq_len=self.max_seq_len, enable_dynamic_shape=args.enable_dynamic_shape, ) - if args.use_hf_rope: - self.apply_rotary_emb = hf_apply_rotary_emb - else: - self.apply_rotary_emb = RotaryEmbedding() def forward( self, @@ -336,7 +409,7 @@ def forward( v = v.view(bsz, seqlen, self.n_local_kv_heads, self.head_dim) # RoPE relative positional embeddings - q, k = self.apply_rotary_emb(q, k, freqs_cos, freqs_sin) + q, k = self.rope.forward(q, k, freqs_cos, freqs_sin) if self.use_kv_cache: assert input_pos is not None @@ -424,13 +497,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor: class TransformerBlock(nn.Module): - def __init__(self, layer_id: int, args: ModelArgs): + def __init__(self, layer_id: int, args: ModelArgs, rope: Rope): super().__init__() self.use_kv_cache = args.use_kv_cache self.n_heads = args.n_heads self.dim = args.dim self.head_dim = args.head_dim - self.attention = Attention(args, layer_id) + self.attention = Attention(args, layer_id, rope) if args.moe: self.block_sparse_moe = MOEFeedForward(args) else: @@ -459,9 +532,10 @@ def __init__(self, params: ModelArgs): self.n_layers = params.n_layers self.tok_embeddings = nn.Embedding(params.vocab_size, params.dim) + self.rope = Rope(params) self.layers = torch.nn.ModuleList() for layer_id in range(params.n_layers): - self.layers.append(TransformerBlock(layer_id, params)) + self.layers.append(TransformerBlock(layer_id, params, self.rope)) self.norm = RMSNorm(params.dim, eps=params.norm_eps) self.output = nn.Linear(params.dim, params.vocab_size, bias=False) self.use_kv_cache = params.use_kv_cache @@ -469,23 +543,6 @@ def __init__(self, params: ModelArgs): self.max_seq_len = params.max_seq_len self.input_prune_map = params.input_prune_map self.output_prune_map = params.output_prune_map - if params.use_hf_rope: - self.precompute_freqs_cis = hf_precompute_freqs_cis - else: - self.precompute_freqs_cis = partial( - precompute_freqs_cis, use_scaled=params.use_scaled_rope - ) - freqs_cos, freqs_sin = self.precompute_freqs_cis( - params.head_dim, - ( - params.max_seq_len # Normal llama2. - if params.ffn_dim_multiplier is None - else params.max_seq_len * 2 # Sharded checkpoint. 
- ), - params.rope_freq_base, - ) - self.register_buffer("freqs_cos", freqs_cos, persistent=False) - self.register_buffer("freqs_sin", freqs_sin, persistent=False) def forward( self, @@ -502,33 +559,7 @@ def forward( if tokens is not None and h is None: h = self.tok_embeddings(tokens) seqlen = h.shape[1] - - if self.use_kv_cache: - assert ( - input_pos is not None - ), "input_pos must be provided when use_kv_cache is True" - - if self.params.enable_dynamic_shape: - # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos. - input_pos_item = input_pos[-1].item() - torch._check_is_size(input_pos_item) - torch._check(input_pos_item < self.params.max_seq_len) - # pyre-ignore: Incompatible parameter type [6]: torch.narrow does expect int or Tensor - freqs_cos = self.freqs_cos.narrow(0, input_pos_item, seqlen) - # pyre-ignore: Incompatible parameter type [6] - freqs_sin = self.freqs_sin.narrow(0, input_pos_item, seqlen) - else: - # When not using dynamic shape, use of the .item results in - # symints, due to querying the data from tensor. - # this path avoids that for mps backend, although probably mps backend - # can support dynamic shape? - freqs_cos = self.freqs_cos[input_pos] - freqs_sin = self.freqs_sin[input_pos] - - else: - assert input_pos is None, "input_pos is unused when use_kv_cache is False" - freqs_cos = self.freqs_cos[:seqlen] - freqs_sin = self.freqs_sin[:seqlen] + freqs_cos, freqs_sin = self.rope.get_freqs(input_pos, seqlen) for layer in self.layers: h = layer( diff --git a/examples/models/llama/source_transformation/rope.py b/examples/models/llama/source_transformation/rope.py index a2a2264b24..79fb239966 100644 --- a/examples/models/llama/source_transformation/rope.py +++ b/examples/models/llama/source_transformation/rope.py @@ -13,23 +13,27 @@ def materialze_broadcast_of_rope_freq_cis( module: torch.nn.Module, ): assert isinstance(module, Transformer) - assert module.freqs_cos.dim() == 2 - dim0 = module.freqs_cos.size(0) - dim1 = module.freqs_cos.size(1) + assert module.rope.freqs_cos.dim() == 2 + dim0 = module.rope.freqs_cos.size(0) + dim1 = module.rope.freqs_cos.size(1) module_attention = module.layers[0].attention assert ( module_attention.n_local_kv_heads == module_attention.n_local_heads ), f"For rope freqs to be materialized for broadcast, q, k, v num heads must match. For q got {module_attention.n_kv_heads} for k got {module_attention.n_local_heads} and v got {module_attention.n_local_kv_heads}" num_heads = module_attention.n_local_heads - module.freqs_cos = module.freqs_cos.view(dim0, 1, dim1) - module.freqs_cos = module.freqs_cos.expand(dim0, num_heads, dim1).contiguous() - assert module.freqs_sin.dim() == 2 - assert dim0 == module.freqs_sin.size( + module.rope.freqs_cos = module.rope.freqs_cos.view(dim0, 1, dim1) + module.rope.freqs_cos = module.rope.freqs_cos.expand( + dim0, num_heads, dim1 + ).contiguous() + assert module.rope.freqs_sin.dim() == 2 + assert dim0 == module.rope.freqs_sin.size( 0 - ), f"sin and cos freq table sizes must match. Mismatch found at dim 0: {dim0} vs {module.freqs_sin.size(0)}" - assert dim1 == module.freqs_sin.size( + ), f"sin and cos freq table sizes must match. Mismatch found at dim 0: {dim0} vs {module.rope.freqs_sin.size(0)}" + assert dim1 == module.rope.freqs_sin.size( 1 - ), f"sin and cos freq table sizes must match. 
Mismatch found at dim 1: {dim1} vs {module.freqs_sin.size(1)}" - module.freqs_sin = module.freqs_sin.view(dim0, 1, dim1) - module.freqs_sin = module.freqs_sin.expand(dim0, num_heads, dim1).contiguous() + ), f"sin and cos freq table sizes must match. Mismatch found at dim 1: {dim1} vs {module.rope.freqs_sin.size(1)}" + module.rope.freqs_sin = module.rope.freqs_sin.view(dim0, 1, dim1) + module.rope.freqs_sin = module.rope.freqs_sin.expand( + dim0, num_heads, dim1 + ).contiguous() return module From 6b738410e400b173fdda78bea352aa5eb334e751 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 27 Nov 2024 12:03:49 -0800 Subject: [PATCH 13/27] implement position encoding for shifted tokens Pull Request resolved: https://github.com/pytorch/executorch/pull/6646 In AttentionSink, it uses tokens' positions in the KVCache instead of the actual text. When tokens get shifted in KVCache, it needs to update q and k's position embedding. In the original [implementation](https://github.com/mit-han-lab/streaming-llm) of AttentionSink with Rope, it caches the original q and k in KVCache and apply position embedding during inference. This PR adds `RopeWithAttentionSink`. It assumes that q and k are already encoded with their original position. When we shift tokens, we reapply the position delta. This has two benefits: - minimize our code since our existing `llama_transformer` applies rope embedding before doing KVCache update - avoid performance regression when tokens are not shifted because we don't need to reapply position encoding in KVCache for them ghstack-source-id: 255579838 Differential Revision: [D65366440](https://our.internmc.facebook.com/intern/diff/D65366440/) --------- Co-authored-by: Lunwen He --- examples/models/llama/TARGETS | 14 ++++ examples/models/llama/rope.py | 41 +++++++++++ .../source_transformation/attention_sink.py | 62 ++++++++++++++++ .../test_attention_sink.py | 73 +++++++++++++++++++ 4 files changed, 190 insertions(+) create mode 100644 examples/models/llama/source_transformation/attention_sink.py create mode 100644 examples/models/llama/source_transformation/test_attention_sink.py diff --git a/examples/models/llama/TARGETS b/examples/models/llama/TARGETS index cf387bfab2..284520d4d5 100644 --- a/examples/models/llama/TARGETS +++ b/examples/models/llama/TARGETS @@ -93,6 +93,7 @@ runtime.python_library( "source_transformation/sdpa.py", "source_transformation/spin_quant.py", "source_transformation/vulkan_rope.py", + "source_transformation/attention_sink.py", ], _is_external_target = True, base_module = "executorch.examples.models.llama", @@ -213,3 +214,16 @@ runtime.python_test( "//executorch/examples/models/llama:llama_transformer", ], ) + +runtime.python_test( + name = "attention_sink_test", + srcs = [ + "source_transformation/test_attention_sink.py", + ], + supports_static_listing = False, + deps = [ + "fbsource//third-party/pypi/parameterized:parameterized", + "//caffe2:torch", + ":export_library", + ], +) diff --git a/examples/models/llama/rope.py b/examples/models/llama/rope.py index 0383c79898..1445787f5e 100644 --- a/examples/models/llama/rope.py +++ b/examples/models/llama/rope.py @@ -92,6 +92,22 @@ def apply_rotary_emb( return xq_out.type_as(xq), xk_out.type_as(xk) +def apply_rotary_emb_to_k( + xk: torch.Tensor, freqs_cos: torch.Tensor, freqs_sin: torch.Tensor +) -> torch.Tensor: + xk_r, xk_i = xk.float().reshape(xk.shape[:-1] + (-1, 2)).unbind(-1) + + freqs_cos = reshape_for_broadcast(freqs_cos, xk_r) + freqs_sin = reshape_for_broadcast(freqs_sin, xk_r) + + xk_out_r = 
xk_r * freqs_cos - xk_i * freqs_sin + xk_out_i = xk_r * freqs_sin + xk_i * freqs_cos + + xk_out = torch.stack([xk_out_r, xk_out_i], dim=-1).flatten(3) + + return xk_out.type_as(xk) + + class RotaryEmbedding(torch.nn.Module): def __init__(self): super().__init__() @@ -160,3 +176,28 @@ def hf_apply_rotary_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) return q_embed, k_embed + + +def hf_apply_rotary_emb_to_k(k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the key tensors. + + Args: + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of k. Similarly, if k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `torch.Tensor` the key tensor rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + k_embed = (k * cos) + (rotate_half(k) * sin) + return k_embed diff --git a/examples/models/llama/source_transformation/attention_sink.py b/examples/models/llama/source_transformation/attention_sink.py new file mode 100644 index 0000000000..94f5b47871 --- /dev/null +++ b/examples/models/llama/source_transformation/attention_sink.py @@ -0,0 +1,62 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# Components for supporting Attention Sink. See +# https://arxiv.org/abs/2309.17453 for more details about Attention Sink. + +import torch + +from executorch.examples.models.llama.llama_transformer import ModelArgs, Rope +from executorch.examples.models.llama.rope import ( + apply_rotary_emb_to_k, + hf_apply_rotary_emb_to_k, +) + + +class RopeWithAttentionSink(Rope): + """ + Rope that helps adjust position encoding when tokens are shifted in KVCache. + For AttentionSink, when tokens are shifted in KVCache, we need to use positions + in KVCache instead of positions in the actual text. + """ + + def __init__(self, params: ModelArgs): + super().__init__(params) + if self.params.use_hf_rope: + self.apply_rotary_emb_to_k = hf_apply_rotary_emb_to_k + else: + self.apply_rotary_emb_to_k = apply_rotary_emb_to_k + + def rerotate_k( + self, + k: torch.Tensor, + original_position: int, + new_position: int, + ): + """ + Rerotate k from original_position to new_position. 
This is done by rerotating + k with (new_position * theta - original_position * theta) with the following matrix: + (cos(delta), -sin(delta) + sin(delta), cos(delta)) + where delta = new_position * theta - original_position * theta + + The shape of k is (batch_size, seq_len, n_local_heads, head_dim) + + Based on https://github.com/huggingface/transformers/blame/main/src/transformers/cache_utils.py#L961 + """ + seq_len = k.shape[1] + original_freqs_cos = self.freqs_cos.narrow(0, original_position, seq_len) + original_freqs_sin = self.freqs_sin.narrow(0, original_position, seq_len) + new_freqs_cos = self.freqs_cos.narrow(0, new_position, seq_len) + new_freqs_sin = self.freqs_sin.narrow(0, new_position, seq_len) + rerotation_cos = ( + new_freqs_cos * original_freqs_cos + new_freqs_sin * original_freqs_sin + ) + rerotation_sin = ( + new_freqs_sin * original_freqs_cos - new_freqs_cos * original_freqs_sin + ) + + return self.apply_rotary_emb_to_k(k, rerotation_cos, rerotation_sin) diff --git a/examples/models/llama/source_transformation/test_attention_sink.py b/examples/models/llama/source_transformation/test_attention_sink.py new file mode 100644 index 0000000000..adb3bff3a5 --- /dev/null +++ b/examples/models/llama/source_transformation/test_attention_sink.py @@ -0,0 +1,73 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import torch +from executorch.examples.models.llama.llama_transformer import ModelArgs + +from executorch.examples.models.llama.source_transformation.attention_sink import ( + RopeWithAttentionSink, +) +from parameterized import parameterized + + +class RopeWithAttentionSinkTest(unittest.TestCase): + + def setUp(self): + torch.manual_seed(42) + self.params = ModelArgs(use_kv_cache=True, enable_dynamic_shape=True) + self.rope_with_attention_sink = RopeWithAttentionSink(params=self.params) + + @parameterized.expand( + [ + [128, 127], # Rotate left + [128, 128], # No rotation + [128, 129], # Rotate right + ] + ) + def test_rotate(self, original_position, new_position): + seq_len = 32 + + q = torch.rand( + 1, seq_len, self.params.n_heads, self.params.head_dim, dtype=torch.float32 + ) + k = torch.rand( + 1, + seq_len, + self.params.n_heads, + self.params.head_dim, + dtype=torch.float32, + ) + freqs_cos, freqs_sin = self.rope_with_attention_sink.get_freqs( + input_pos=torch.tensor([original_position], dtype=torch.int32), + seq_len=seq_len, + ) + _, pre_rotated_k = self.rope_with_attention_sink.forward( + q=q, + k=k, + freqs_cos=freqs_cos, + freqs_sin=freqs_sin, + ) + + rerotated_k = self.rope_with_attention_sink.rerotate_k( + k=pre_rotated_k, + original_position=original_position, + new_position=new_position, + ) + + freqs_cos, freqs_sin = self.rope_with_attention_sink.get_freqs( + input_pos=torch.tensor([new_position], dtype=torch.int32), + seq_len=seq_len, + ) + _, expected_k = self.rope_with_attention_sink.forward( + q=q, + k=k, + freqs_cos=freqs_cos, + freqs_sin=freqs_sin, + ) + + torch.testing.assert_close(rerotated_k, expected_k) From c726a9bf545f7721f7861aacda373775c1caa4c5 Mon Sep 17 00:00:00 2001 From: pytorchbot Date: Wed, 27 Nov 2024 12:06:49 -0800 Subject: [PATCH 14/27] Implement get_freqs for RopeWithAttentionSink This PR implements the `get_freqs` function for `RopeWithAttentionSink`. 
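
(For intuition, a worked example matching the unit tests added below: with
window_size=252 and sink_size=4 the cache covers max_seq_length=256
positions. A request at input_pos=250 with seq_len=10 and
eviction_batch_size=1 does not fit, so num_to_evict =
max(250 + 0 - 256 + 10, 1) = 4, position_shift becomes -4, and the
frequency tables are read from position 246.)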
It returns the `freqs_cos` and `freqs_sin` for the given `input_pos` and
`seq_len` after shifting tokens in the pre-computed `freqs_cos` and
`freqs_sin`.

Differential Revision: [D66525306](https://our.internmc.facebook.com/intern/diff/D66525306/)

ghstack-source-id: 255582545
Pull Request resolved: https://github.com/pytorch/executorch/pull/7100

Co-authored-by: Lunwen He
---
 .../source_transformation/attention_sink.py   | 29 ++++++++++-
 .../test_attention_sink.py                    | 51 ++++++++++++++++++-
 2 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/examples/models/llama/source_transformation/attention_sink.py b/examples/models/llama/source_transformation/attention_sink.py
index 94f5b47871..8f4fd1ebd2 100644
--- a/examples/models/llama/source_transformation/attention_sink.py
+++ b/examples/models/llama/source_transformation/attention_sink.py
@@ -7,6 +7,8 @@
 # Components for supporting Attention Sink. See
 # https://arxiv.org/abs/2309.17453 for more details about Attention Sink.

+from typing import Optional
+
 import torch

 from executorch.examples.models.llama.llama_transformer import ModelArgs, Rope
@@ -23,12 +25,37 @@ class RopeWithAttentionSink(Rope):
     in KVCache instead of positions in the actual text.
     """

-    def __init__(self, params: ModelArgs):
+    def __init__(
+        self,
+        params: ModelArgs,
+        window_size: int,
+        sink_size: int,
+        eviction_batch_size: int,
+    ):
         super().__init__(params)
         if self.params.use_hf_rope:
             self.apply_rotary_emb_to_k = hf_apply_rotary_emb_to_k
         else:
             self.apply_rotary_emb_to_k = apply_rotary_emb_to_k
+        self.max_seq_length = window_size + sink_size
+        assert self.max_seq_length == self.params.max_seq_len
+        self.eviction_batch_size = eviction_batch_size
+        self.position_shift = 0
+
+    def get_freqs(self, input_pos: Optional[torch.Tensor], seq_len: int):
+        assert input_pos is not None
+
+        input_pos_item = input_pos.item()
+        torch._check_is_size(input_pos_item)
+        if input_pos_item + self.position_shift + seq_len > self.max_seq_length:
+            # There are not enough spaces in the cache to store the new tokens.
+            # We need to evict some old tokens and shift some recent tokens.
+ num_to_evict = max( + input_pos_item + self.position_shift - self.max_seq_length + seq_len, + self.eviction_batch_size, + ) + self.position_shift -= num_to_evict # pyre-ignore [8] + return super().get_freqs(input_pos + self.position_shift, seq_len) def rerotate_k( self, diff --git a/examples/models/llama/source_transformation/test_attention_sink.py b/examples/models/llama/source_transformation/test_attention_sink.py index adb3bff3a5..8eaa992dc3 100644 --- a/examples/models/llama/source_transformation/test_attention_sink.py +++ b/examples/models/llama/source_transformation/test_attention_sink.py @@ -17,10 +17,57 @@ class RopeWithAttentionSinkTest(unittest.TestCase): + def _init_rope(self, params: ModelArgs, eviction_batch_size: int): + return RopeWithAttentionSink( + params=params, + window_size=252, + sink_size=4, + eviction_batch_size=eviction_batch_size, + ) + def setUp(self): torch.manual_seed(42) - self.params = ModelArgs(use_kv_cache=True, enable_dynamic_shape=True) - self.rope_with_attention_sink = RopeWithAttentionSink(params=self.params) + self.params = ModelArgs( + use_kv_cache=True, enable_dynamic_shape=True, max_seq_len=256 + ) + self.rope_with_attention_sink = self._init_rope( + params=self.params, eviction_batch_size=1 + ) + + @parameterized.expand( + [ + [0, 10, 1, 0], # No shift + [250, 10, 1, 246], # Some shift + [256, 10, 1, 246], # All shift + [0, 10, 30, 0], # No shift with batch eviction + [250, 10, 30, 220], # Some shift with batch eviction + [256, 10, 30, 226], # All shift with batch eviction + ] + ) + def test_get_freqs( + self, input_pos, seq_len, eviction_batch_size, expected_result_pos + ): + self.rope_with_attention_sink = self._init_rope( + params=self.params, eviction_batch_size=eviction_batch_size + ) + + freqs_cos, freqs_sin = self.rope_with_attention_sink.get_freqs( + input_pos=torch.tensor([input_pos], dtype=torch.int32), + seq_len=seq_len, + ) + + torch.testing.assert_close( + freqs_cos, + self.rope_with_attention_sink.freqs_cos.narrow( + 0, expected_result_pos, seq_len + ), + ) + torch.testing.assert_close( + freqs_sin, + self.rope_with_attention_sink.freqs_sin.narrow( + 0, expected_result_pos, seq_len + ), + ) @parameterized.expand( [ From 3a0e5273b7b858b45e6a95b174ec1c73208d206d Mon Sep 17 00:00:00 2001 From: Hansong <107070759+kirklandsign@users.noreply.github.com> Date: Wed, 27 Nov 2024 16:27:53 -0800 Subject: [PATCH 15/27] Fix cadence BUCK deps and pyre Differential Revision: D66553586 Pull Request resolved: https://github.com/pytorch/executorch/pull/7116 --- backends/cadence/runtime/TARGETS | 1 + backends/cadence/runtime/runtime.py | 3 --- backends/cadence/runtime/utils.py | 16 +++------------- examples/cadence/operators/test_add_op.py | 2 ++ extension/llm/export/quantizer_lib.py | 2 -- 5 files changed, 6 insertions(+), 18 deletions(-) diff --git a/backends/cadence/runtime/TARGETS b/backends/cadence/runtime/TARGETS index db3fe0ad1e..95a7bdc369 100644 --- a/backends/cadence/runtime/TARGETS +++ b/backends/cadence/runtime/TARGETS @@ -18,6 +18,7 @@ python_library( "//executorch/devtools/bundled_program:config", "//executorch/devtools/bundled_program:core", "//executorch/devtools/bundled_program/serialize:lib", + "//executorch/devtools:lib", "//executorch/exir:lib", ], ) diff --git a/backends/cadence/runtime/runtime.py b/backends/cadence/runtime/runtime.py index bf2932d9c7..0268931c40 100644 --- a/backends/cadence/runtime/runtime.py +++ b/backends/cadence/runtime/runtime.py @@ -167,9 +167,7 @@ def run( def compare( - # pyre-fixme[2]: Parameter 
annotation cannot be `Any`. outputs: Any, - # pyre-fixme[2]: Parameter annotation cannot be `Any`. ref_outputs: Any, name: str = "", eps_error: float = 1e-1, @@ -223,7 +221,6 @@ def run_and_compare( compare(outputs, ref_outputs, eps_error=eps_error, eps_warn=eps_warn) -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def to_nd_array(v: Union[bool, numbers.Number, ndarray, torch.Tensor]) -> np.ndarray: if isinstance(v, np.ndarray): return v diff --git a/backends/cadence/runtime/utils.py b/backends/cadence/runtime/utils.py index b3ed622e8b..0a85b6dd61 100644 --- a/backends/cadence/runtime/utils.py +++ b/backends/cadence/runtime/utils.py @@ -13,12 +13,11 @@ import torch -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. -def distance(fn: Callable[[np.ndarray, np.ndarray], float]) -> Callable[ +def distance( + fn: Callable[[np.ndarray, np.ndarray], float], +) -> Callable[ [ - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. typing.Union[np.ndarray, torch._tensor.Tensor], - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. typing.Union[np.ndarray, torch._tensor.Tensor], ], float, @@ -27,9 +26,7 @@ def distance(fn: Callable[[np.ndarray, np.ndarray], float]) -> Callable[ # the distance between two N-D tensors given a function. This can be a RMS # function, maximum abs diff, or any kind of distance function. def wrapper( - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. a: Union[np.ndarray, torch.Tensor], - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. b: Union[np.ndarray, torch.Tensor], ) -> float: # convert a and b to np.ndarray type fp64 @@ -68,24 +65,20 @@ def wrapper( @distance -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def rms(a: np.ndarray, b: np.ndarray) -> float: return ((a - b) ** 2).mean() ** 0.5 @distance -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def max_abs_diff(a: np.ndarray, b: np.ndarray) -> float: return np.abs(a - b).max() @distance -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def max_rel_diff(x: np.ndarray, x_ref: np.ndarray) -> float: return np.abs((x - x_ref) / x_ref).max() -# pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. def to_np_arr_fp64(x: Union[np.ndarray, torch.Tensor]) -> np.ndarray: if isinstance(x, torch.Tensor): x = x.detach().cpu().numpy() @@ -94,11 +87,8 @@ def to_np_arr_fp64(x: Union[np.ndarray, torch.Tensor]) -> np.ndarray: return x -# pyre-fixme[3]: Return type must be annotated. def normalized_rms( - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. predicted: Union[np.ndarray, torch.Tensor], - # pyre-fixme[24]: Generic type `np.ndarray` expects 2 type parameters. ground_truth: Union[np.ndarray, torch.Tensor], ): num = rms(predicted, ground_truth) diff --git a/examples/cadence/operators/test_add_op.py b/examples/cadence/operators/test_add_op.py index 5481540b4f..7799fe624b 100644 --- a/examples/cadence/operators/test_add_op.py +++ b/examples/cadence/operators/test_add_op.py @@ -13,6 +13,7 @@ class ATenOpTestCases(unittest.TestCase): + # pyre-fixme[16]: Module `parameterized.parameterized` has no attribute `expand`. @parameterized.expand( [ [(7, 5, 6), (7, 5, 6)], @@ -61,6 +62,7 @@ def forward(self, x: torch.Tensor, y: torch.Tensor): model, (X, Y), file_name=self._testMethodName, run_and_compare=False ) + # pyre-fixme[16]: Module `parameterized.parameterized` has no attribute `expand`. 
    @parameterized.expand(
        [
            [(7, 5, 6), (7, 5, 6)],
diff --git a/extension/llm/export/quantizer_lib.py b/extension/llm/export/quantizer_lib.py
index ba281864a9..3a9eebd2c3 100644
--- a/extension/llm/export/quantizer_lib.py
+++ b/extension/llm/export/quantizer_lib.py
@@ -184,14 +184,12 @@ def get_qnn_quantizer(
         )
         qnn_quantizer.set_per_channel_conv_quant(enable=False)
         qnn_quantizer.set_per_channel_linear_quant(enable=False)
-        # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
         qnn_quantizer.set_quant_config(
             quant_dtype, is_qat=is_qat, act_observer=MinMaxObserver
         )
     elif quant_config == "16a4w":
         # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
         quant_dtype = QuantDtype.use_16a4w
-        # pyre-ignore: Undefined attribute [16]: Module `executorch.backends` has no attribute `qualcomm`.
         qnn_quantizer.set_quant_config(
             quant_dtype, is_qat=is_qat, act_observer=MinMaxObserver
         )

From 651af12b19e9aa109ce974a0d95a1710b397d6c4 Mon Sep 17 00:00:00 2001
From: lg-zhang
Date: Wed, 27 Nov 2024 16:59:46 -0800
Subject: [PATCH 16/27] use ovrsource libtorch in executorch

Differential Revision: D66526578

Pull Request resolved: https://github.com/pytorch/executorch/pull/7101
---
 extension/pytree/aten_util/targets.bzl | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/extension/pytree/aten_util/targets.bzl b/extension/pytree/aten_util/targets.bzl
index e179308020..5ba7e90596 100644
--- a/extension/pytree/aten_util/targets.bzl
+++ b/extension/pytree/aten_util/targets.bzl
@@ -20,13 +20,7 @@ def define_common_targets():
             "//executorch/runtime/platform:platform",
         ],
         compiler_flags = ["-Wno-missing-prototypes"],
-        fbcode_deps = [
-            "//caffe2:ATen-core",
-            "//caffe2:ATen-cpu",
-            "//caffe2/c10:c10",
-        ],
-        xplat_deps = [
-            "//xplat/caffe2:torch_mobile_core",
-            "//xplat/caffe2/c10:c10",
+        external_deps = [
+            "torch-core-cpp",
         ],
     )

From d243ffecf790295be0716c28b019b50d7fa13147 Mon Sep 17 00:00:00 2001
From: Hannes Friederich
Date: Thu, 28 Nov 2024 10:13:49 +0100
Subject: [PATCH 17/27] Back out "use ovrsource libtorch in executorch"

Differential Revision: D66570005

Pull Request resolved: https://github.com/pytorch/executorch/pull/7122
---
 extension/pytree/aten_util/targets.bzl | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/extension/pytree/aten_util/targets.bzl b/extension/pytree/aten_util/targets.bzl
index 5ba7e90596..e179308020 100644
--- a/extension/pytree/aten_util/targets.bzl
+++ b/extension/pytree/aten_util/targets.bzl
@@ -20,7 +20,13 @@ def define_common_targets():
             "//executorch/runtime/platform:platform",
         ],
         compiler_flags = ["-Wno-missing-prototypes"],
-        external_deps = [
-            "torch-core-cpp",
+        fbcode_deps = [
+            "//caffe2:ATen-core",
+            "//caffe2:ATen-cpu",
+            "//caffe2/c10:c10",
+        ],
+        xplat_deps = [
+            "//xplat/caffe2:torch_mobile_core",
+            "//xplat/caffe2/c10:c10",
         ],
     )

From 7c934db080b9ecd97190d76f7fc815dfa53e7f5a Mon Sep 17 00:00:00 2001
From: Zingo Andersen
Date: Tue, 19 Nov 2024 09:23:09 +0100
Subject: [PATCH 18/27] Arm Backend: Update Ethos-U compiler Vela to 4.1.0

This fixes a code generation problem. Some Ethos-U85 tests start working and
some Ethos-U55 tests are disabled due to stricter testing added to the Vela
compiler.
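
As a quick sanity check after re-running examples/arm/setup.sh, the bump can
be confirmed from the shell. Hypothetical session (assumes the vela binary is
on PATH; the exact output format may vary between releases):

    $ vela --version
    4.1.0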
Signed-off-by: Zingo Andersen Change-Id: I53a10a1675cea34e105e04f864dfa3cb4cc626fa --- backends/arm/test/ops/test_bmm.py | 19 +++++--- backends/arm/test/ops/test_conv_combos.py | 2 - backends/arm/test/ops/test_depthwise_conv.py | 25 ++++++++--- backends/arm/test/ops/test_div.py | 46 +++++++++++++++----- backends/arm/test/ops/test_layer_norm.py | 7 ++- backends/arm/test/ops/test_logsoftmax.py | 46 ++++++++++++++++---- backends/arm/test/ops/test_mean_dim.py | 4 +- backends/arm/test/ops/test_mul.py | 7 +-- backends/arm/test/ops/test_softmax.py | 29 +++++++++++- backends/arm/test/ops/test_sum.py | 24 +++++++++- backends/arm/test/ops/test_var.py | 32 ++++++++++++-- examples/arm/aot_arm_compiler.py | 4 +- examples/arm/setup.sh | 11 +++-- 13 files changed, 201 insertions(+), 55 deletions(-) diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 824ec46372..2cf90b2119 100644 --- a/backends/arm/test/ops/test_bmm.py +++ b/backends/arm/test/ops/test_bmm.py @@ -22,8 +22,8 @@ class TestBMM(unittest.TestCase): class BMM(torch.nn.Module): test_parameters = [ - (torch.rand(5, 3, 5), torch.rand(5, 5, 2)), (torch.rand(2, 1, 1), torch.rand(2, 1, 1)), + (torch.rand(5, 3, 5), torch.rand(5, 5, 2)), (torch.ones(1, 55, 3), torch.ones(1, 3, 44)), (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)), (-10 * torch.randn(2, 32, 64), 5 + 5 * torch.randn(2, 64, 32)), @@ -147,32 +147,37 @@ def test_bmm_single_input_tosa_BI(self, operand1: torch.Tensor): @parameterized.expand(BMM.test_parameters) @unittest.expectedFailure - def test_bmm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): + def test_bmm_u55_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) self._test_bmm_ethosu_BI_pipeline( self.BMM(), common.get_u55_compile_spec(), test_data ) - @parameterized.expand(BMM.test_parameters) - @common.expectedFailureOnFVP + @parameterized.expand(BMM.test_parameters[:1]) def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) self._test_bmm_ethosu_BI_pipeline( self.BMM(), common.get_u85_compile_spec(), test_data ) + @parameterized.expand(BMM.test_parameters[1:]) + @unittest.expectedFailure + def test_bmm_u85_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor): + test_data = (operand1, operand2) + self._test_bmm_ethosu_BI_pipeline( + self.BMM(), common.get_u85_compile_spec(), test_data + ) + # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy @parameterized.expand(BMMSingleInput.test_parameters) @unittest.expectedFailure - def test_bmm_single_input_u55_BI(self, operand1: torch.Tensor): + def test_bmm_single_input_u55_BI_xfails(self, operand1: torch.Tensor): test_data = (operand1,) self._test_bmm_ethosu_BI_pipeline( self.BMMSingleInput(), common.get_u55_compile_spec(), test_data ) - # Numerical issues on FVP, MLETORCH 534 @parameterized.expand(BMMSingleInput.test_parameters) - @common.expectedFailureOnFVP def test_bmm_single_input_u85_BI(self, operand1: torch.Tensor): test_data = (operand1,) self._test_bmm_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 7555fff720..001c4a2bd5 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -275,8 +275,6 @@ def test_conv_meandim_u55_BI(self): model.get_inputs(), ) - # Numerical Issues on FVP, MLETORCH-520 - @common.expectedFailureOnFVP def 
test_conv_meandim_u85_BI(self): model = ComboConv2dMeandim() self._test_conv_combo_ethos_BI_pipeline( diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py index 28cb9ac844..628b25c259 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -156,6 +156,19 @@ ("two_dw_conv2d", two_dw_conv2d), ] +testsuite_conv2d_u85 = [ + ("2x2_1x6x4x4_gp6_st1", dw_conv2d_2x2_1x6x4x4_gp6_st1), + ("3x3_1x3x256x256_gp3_st1", dw_conv2d_3x3_1x3x256x256_gp3_st1), + ("3x3_1x4x256x256_gp4_st1", dw_conv2d_3x3_1x4x256x256_gp4_st1), + ("3x3_1x4x256x256_gp4_nobias", dw_conv2d_3x3_1x4x256x256_gp4_nobias), +] + +testsuite_conv2d_u85_xfails = [ + ("3x3_2x8x198x198_gp8_st3", dw_conv2d_3x3_2x8x198x198_gp8_st3), + ("two_dw_conv2d", two_dw_conv2d), +] + + testsuite_conv1d = [ ("2_1x6x4_gp6_st1", dw_conv1d_2_1x6x4_gp6_st1), ("two_dw_conv1d", two_dw_conv1d), @@ -274,10 +287,8 @@ def test_dw_conv1d_u55_BI( model.get_inputs(), ) - # All test cases except 3x3_1x3x256x256_gp3_st1 have numerical issues on FVP. MLETORCH-520 - @parameterized.expand(testsuite_conv1d[:-2] + testsuite_conv2d) - @common.expectedFailureOnFVP - def test_dw_conv_u85_BI_xfails( + @parameterized.expand(testsuite_conv1d + testsuite_conv2d_u85) + def test_dw_conv_u85_BI( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): self._test_dw_conv_ethos_BI_pipeline( @@ -288,8 +299,10 @@ def test_dw_conv_u85_BI_xfails( model.get_inputs(), ) - @parameterized.expand(testsuite_conv1d[-2:]) - def test_dw_conv_u85_BI( + # All test cases except 3x3_1x3x256x256_gp3_st1 have numerical issues on FVP. MLETORCH-520 + @parameterized.expand(testsuite_conv2d_u85_xfails) + @common.expectedFailureOnFVP + def test_dw_conv_u85_BI_xfails( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): self._test_dw_conv_ethos_BI_pipeline( diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index b3815f3e7c..27febd714e 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -26,18 +26,18 @@ torch.ones(5), None, ), - ( - "op_div_rank1_rand", - torch.rand(5) * 5, - torch.rand(5) * 5, - None, - ), ( "op_div_rank1_negative_ones", torch.ones(5) * (-1), torch.ones(5) * (-1), None, ), + ( + "op_div_rank1_rand", + torch.rand(5) * 5, + torch.rand(5) * 5, + None, + ), ( "op_div_rank4_ones", torch.ones(5, 10, 25, 20), @@ -183,9 +183,7 @@ def test_div_tosa_BI( test_data = (input_, other_) self._test_div_tosa_BI_pipeline(self.Div(), test_data) - # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite) - @common.expectedFailureOnFVP + @parameterized.expand(test_data_suite[:2]) def test_div_u55_BI( self, test_name: str, @@ -199,8 +197,21 @@ def test_div_u55_BI( ) # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite) + @parameterized.expand(test_data_suite[2:]) @common.expectedFailureOnFVP + def test_div_u55_BI_xfails( + self, + test_name: str, + input_: Union[torch.Tensor, torch.types.Number], + other_: Union[torch.Tensor, torch.types.Number], + rounding_mode: Optional[str] = None, + ): + test_data = (input_, other_) + self._test_div_ethos_BI_pipeline( + self.Div(), common.get_u55_compile_spec(), test_data + ) + + @parameterized.expand(test_data_suite[:2]) def test_div_u85_BI( self, test_name: str, @@ -212,3 +223,18 @@ def test_div_u85_BI( self._test_div_ethos_BI_pipeline( self.Div(), 
common.get_u85_compile_spec(), test_data ) + + # Numerical issues on FVP likely due to mul op, MLETORCH-521 + @parameterized.expand(test_data_suite[2:]) + @common.expectedFailureOnFVP + def test_div_u85_BI_xfails( + self, + test_name: str, + input_: Union[torch.Tensor, torch.types.Number], + other_: Union[torch.Tensor, torch.types.Number], + rounding_mode: Optional[str] = None, + ): + test_data = (input_, other_) + self._test_div_ethos_BI_pipeline( + self.Div(), common.get_u85_compile_spec(), test_data + ) diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py index 0b06044a59..7375a25383 100644 --- a/backends/arm/test/ops/test_layer_norm.py +++ b/backends/arm/test/ops/test_layer_norm.py @@ -170,9 +170,8 @@ def test_layer_norm_u55_BI( ) # Numerical issues on FVP likely due to mul op, MLETORCH-521 - @parameterized.expand(test_data_suite[:-1]) - @common.expectedFailureOnFVP - def test_layer_norm_u85_BI_fvp_xfails( + @parameterized.expand(test_data_suite[:-2]) + def test_layer_norm_u85_BI_fvp( self, test_name: str, test_data: torch.Tensor, @@ -182,7 +181,7 @@ def test_layer_norm_u85_BI_fvp_xfails( self.LayerNorm(*model_params), common.get_u85_compile_spec(), (test_data,) ) - @parameterized.expand(test_data_suite[-1:]) + @parameterized.expand(test_data_suite[-2:]) @unittest.skip # Flaky def test_layer_norm_u85_BI( self, diff --git a/backends/arm/test/ops/test_logsoftmax.py b/backends/arm/test/ops/test_logsoftmax.py index 5d84fa127f..910384e0a0 100644 --- a/backends/arm/test/ops/test_logsoftmax.py +++ b/backends/arm/test/ops/test_logsoftmax.py @@ -17,14 +17,29 @@ test_data_suite = [ # (test_name, test_data, dim) - ("zeros", torch.zeros(10, 10, 10, 10), 0), - ("zeros_neg_dim", torch.zeros(10, 10, 10, 10), -4), + ("zeros", torch.zeros(10, 8, 5, 2), 0), + ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), ("ones", torch.ones(10, 10), 1), - ("rand_neg_dim", torch.rand(10, 10, 10), -1), - ("rand", torch.rand(10, 10, 10, 10), 2), - ("rand_neg_dim", torch.rand(10, 10, 2, 3), -2), - ("randn", torch.randn(10, 10, 5, 10), 3), - ("randn_neg_dim", torch.randn(1, 10, 10, 10), -3), + ("ones_neg_dim", torch.ones(10, 3, 4), -1), + ("rand", torch.rand(1, 2, 5, 8), 2), + ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2), + ("randn", torch.randn(10, 10, 10, 10), 3), + ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), +] +test_data_suite_u55 = [ + # (test_name, test_data, dim) + ("ones", torch.ones(10, 10), 1), + ("ones_neg_dim", torch.ones(10, 3, 4), -1), + ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), +] + +test_data_suite_u55_xfails = [ + # (test_name, test_data, dim) + ("zeros", torch.zeros(10, 8, 5, 2), 0), + ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), + ("rand", torch.rand(1, 2, 5, 8), 2), + ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2), + ("randn", torch.randn(10, 10, 10, 10), 3), ] @@ -135,7 +150,7 @@ def test_logsoftmax_tosa_BI( ): self._test_logsoftmax_tosa_BI_pipeline(self.LogSoftmax(dim=dim), (test_data,)) - @parameterized.expand(test_data_suite) + @parameterized.expand(test_data_suite_u55) def test_logsoftmax_tosa_u55_BI( self, test_name: str, @@ -146,6 +161,19 @@ def test_logsoftmax_tosa_u55_BI( self.LogSoftmax(dim=dim), (test_data,) ) + # Expected to fail as this is not supported on u55. 
+ @parameterized.expand(test_data_suite_u55_xfails) + @unittest.expectedFailure + def test_logsoftmax_tosa_u55_BI_xfails( + self, + test_name: str, + test_data: torch.Tensor, + dim: int, + ): + self._test_logsoftmax_tosa_u55_BI_pipeline( + self.LogSoftmax(dim=dim), (test_data,) + ) + @parameterized.expand(test_data_suite) def test_logsoftmax_tosa_u85_BI( self, @@ -153,6 +181,6 @@ def test_logsoftmax_tosa_u85_BI( test_data: torch.Tensor, dim: int, ): - self._test_logsoftmax_tosa_u55_BI_pipeline( + self._test_logsoftmax_tosa_u85_BI_pipeline( self.LogSoftmax(dim=dim), (test_data,) ) diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index e8320cf1df..3cb8c5f815 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -269,8 +269,10 @@ def test_meandim_tosa_BI( ): self._test_meandim_tosa_BI_pipeline(self.MeanDim(dim, keepdim), (test_data,)) + # Expected to fail as this is not supported on u55. @parameterized.expand(MeanDim.test_data_suite) - def test_meandim_tosa_u55_BI( + @unittest.expectedFailure + def test_meandim_tosa_u55_BI_xfails( self, test_name: str, test_data: torch.Tensor, diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py index 8f0321ea5f..6d6922628e 100644 --- a/backends/arm/test/ops/test_mul.py +++ b/backends/arm/test/ops/test_mul.py @@ -152,9 +152,7 @@ def test_mul_tosa_BI( test_data = (input_, other_) self._test_mul_tosa_BI_pipeline(self.Mul(), test_data) - # Numerical issues on FVP, MLETORCH-521 @parameterized.expand(test_data_sute) - @common.expectedFailureOnFVP def test_mul_u55_BI( self, test_name: str, @@ -166,10 +164,7 @@ def test_mul_u55_BI( common.get_u55_compile_spec(), self.Mul(), test_data ) - # Numerical issues on FVP, MLETORCH-521 - # test_data_sute[0] works on U85 - @parameterized.expand(test_data_sute[1:]) - @common.expectedFailureOnFVP + @parameterized.expand(test_data_sute) def test_mul_u85_BI( self, test_name: str, diff --git a/backends/arm/test/ops/test_softmax.py b/backends/arm/test/ops/test_softmax.py index f883d6b8de..30215b47f3 100644 --- a/backends/arm/test/ops/test_softmax.py +++ b/backends/arm/test/ops/test_softmax.py @@ -28,6 +28,22 @@ ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), ] +test_data_suite_u55 = [ + # (test_name, test_data, dim) + ("ones", torch.ones(10, 10), 1), + ("ones_neg_dim", torch.ones(10, 3, 4), -1), + ("randn_neg_dim", torch.randn(10, 5, 8, 7), -3), +] + +test_data_suite_u55_xfails = [ + # (test_name, test_data, dim) + ("zeros", torch.zeros(10, 8, 5, 2), 0), + ("zeros_neg_dim", torch.zeros(10, 7, 8, 9), -4), + ("rand", torch.rand(1, 2, 5, 8), 2), + ("rand_neg_dim", torch.rand(2, 10, 8, 10), -2), + ("randn", torch.randn(10, 10, 10, 10), 3), +] + class TestSoftmax(unittest.TestCase): """Tests softmax.""" @@ -136,7 +152,7 @@ def test_softmax_tosa_BI( ): self._test_softmax_tosa_BI_pipeline(self.Softmax(dim=dim), (test_data,)) - @parameterized.expand(test_data_suite) + @parameterized.expand(test_data_suite_u55) def test_softmax_tosa_u55_BI( self, test_name: str, @@ -145,6 +161,17 @@ def test_softmax_tosa_u55_BI( ): self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) + # Expected to fail as this is not supported on u55. 
+ @parameterized.expand(test_data_suite_u55_xfails) + @unittest.expectedFailure + def test_softmax_tosa_u55_BI_xfails( + self, + test_name: str, + test_data: torch.Tensor, + dim: int, + ): + self._test_softmax_tosa_u55_BI_pipeline(self.Softmax(dim=dim), (test_data,)) + @parameterized.expand(test_data_suite) def test_softmax_tosa_u85_BI( self, diff --git a/backends/arm/test/ops/test_sum.py b/backends/arm/test/ops/test_sum.py index 9cd63b0a22..111517afbb 100644 --- a/backends/arm/test/ops/test_sum.py +++ b/backends/arm/test/ops/test_sum.py @@ -35,6 +35,18 @@ class Sum(torch.nn.Module): ((torch.rand(1, 2, 8, 8), [2, 3, 0], True),), ] + test_parameters_u55: list[Tuple[exampledata_t]] = [ + ((torch.rand(10), 0, True),), + ((torch.rand(10, 10), 1, False),), + ((torch.rand(1, 2, 3, 4), 3, True),), + ] + + test_parameters_u55_xfails: list[Tuple[exampledata_t]] = [ + ((torch.rand(10, 10, 10), [-3, 1], True),), + ((torch.rand(2, 1, 5, 8), 1, False),), + ((torch.rand(1, 2, 8, 8), [2, 3, 0], True),), + ] + def forward(self, x: torch.Tensor, dim: int, keepdim: bool): return x.sum(dim=dim, keepdim=keepdim) @@ -112,7 +124,7 @@ def test_sum_tosa_MI(self, test_data: tuple[exampledata_t]): def test_sum_tosa_BI(self, test_data: tuple[exampledata_t]): self._test_sum_tosa_BI_pipeline(self.Sum(), test_data) - @parameterized.expand(Sum.test_parameters) + @parameterized.expand(Sum.test_parameters_u55) def test_sum_u55_BI(self, test_data: tuple[exampledata_t]): self._test_sum_ethosu_BI_pipeline( self.Sum(), @@ -120,6 +132,16 @@ def test_sum_u55_BI(self, test_data: tuple[exampledata_t]): common.get_u55_compile_spec(permute_memory_to_nhwc=False), ) + # Expected to fail as this is not supported on u55. + @parameterized.expand(Sum.test_parameters_u55_xfails) + @unittest.expectedFailure + def test_sum_u55_BI_xfails(self, test_data: tuple[exampledata_t]): + self._test_sum_ethosu_BI_pipeline( + self.Sum(), + test_data, + common.get_u55_compile_spec(permute_memory_to_nhwc=False), + ) + @parameterized.expand(Sum.test_parameters) def test_sum_u85_BI(self, test_data: tuple[exampledata_t]): self._test_sum_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py index 3a1285e6da..06671848cc 100644 --- a/backends/arm/test/ops/test_var.py +++ b/backends/arm/test/ops/test_var.py @@ -50,6 +50,16 @@ class VarDim(torch.nn.Module): (torch.rand(1, 50, 10, 20), -1, True, True), ] + test_parameters_u55 = [ + (torch.randn(1, 50, 10, 20), 1, True, False), + (torch.randn(1, 30, 15, 20), -3, True, True), + ] + + test_parameters_u55_xfails = [ + (torch.rand(1, 50, 10), -2, True, False), + (torch.rand(1, 50, 10, 20), -1, True, True), + ] + def forward( self, x: torch.Tensor, @@ -148,8 +158,10 @@ def test_var_tosa_MI(self, test_tensor: torch.Tensor, keepdim, correction): def test_var_tosa_BI(self, test_tensor: torch.Tensor, keepdim, correction): self._test_var_tosa_BI_pipeline(self.Var(), (test_tensor, keepdim, correction)) + # Expected to fail as this is not supported on u55. 
@parameterized.expand(Var.test_parameters) - def test_var_u55_BI(self, test_tensor: torch.Tensor, keepdim, correction): + @unittest.expectedFailure + def test_var_u55_BI_xfails(self, test_tensor: torch.Tensor, keepdim, correction): self._test_var_ethosu_BI_pipeline( self.Var(), common.get_u55_compile_spec(), @@ -176,7 +188,7 @@ def test_var_dim_tosa_BI(self, test_tensor: torch.Tensor, dim, keepdim, correcti self.VarDim(), (test_tensor, dim, keepdim, correction) ) - @parameterized.expand(VarDim.test_parameters) + @parameterized.expand(VarDim.test_parameters_u55) def test_var_dim_u55_BI(self, test_tensor: torch.Tensor, dim, keepdim, correction): self._test_var_ethosu_BI_pipeline( self.VarDim(), @@ -184,6 +196,18 @@ def test_var_dim_u55_BI(self, test_tensor: torch.Tensor, dim, keepdim, correctio (test_tensor, dim, keepdim, correction), ) + # Expected to fail as this is not supported on u55. + @parameterized.expand(VarDim.test_parameters_u55_xfails) + @unittest.expectedFailure + def test_var_dim_u55_BI_xfails( + self, test_tensor: torch.Tensor, dim, keepdim, correction + ): + self._test_var_ethosu_BI_pipeline( + self.VarDim(), + common.get_u55_compile_spec(), + (test_tensor, dim, keepdim, correction), + ) + @parameterized.expand(VarDim.test_parameters) def test_var_dim_u85_BI(self, test_tensor: torch.Tensor, dim, keepdim, correction): self._test_var_ethosu_BI_pipeline( @@ -208,8 +232,10 @@ def test_var_correction_tosa_BI( self.VarCorrection(), (test_tensor, dim, keepdim, correction) ) + # Expected to fail as this is not supported on u55. @parameterized.expand(VarCorrection.test_parameters) - def test_var_correction_u55_BI( + @unittest.expectedFailure + def test_var_correction_u55_BI_xfails( self, test_tensor: torch.Tensor, dim, keepdim, correction ): self._test_var_ethosu_BI_pipeline( diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py index ddd5fd6b0b..a16d947dd6 100644 --- a/examples/arm/aot_arm_compiler.py +++ b/examples/arm/aot_arm_compiler.py @@ -263,7 +263,7 @@ def get_compile_spec( target, system_config="Ethos_U55_High_End_Embedded", memory_mode="Shared_Sram", - extra_flags="--debug-force-regor --output-format=raw", + extra_flags="--debug-force-regor --output-format=raw --verbose-operators --verbose-cycle-estimate", ) .set_permute_memory_format(True) .set_quantize_io(True) @@ -276,7 +276,7 @@ def get_compile_spec( target, system_config="Ethos_U85_SYS_DRAM_Mid", memory_mode="Shared_Sram", - extra_flags="--output-format=raw", + extra_flags="--output-format=raw --verbose-operators --verbose-cycle-estimate", ) .set_permute_memory_format(True) .set_quantize_io(True) diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 583237729d..84f2371466 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -89,7 +89,11 @@ ethos_u_base_rev="24.08" # tosa reference model tosa_reference_model_url="https://review.mlplatform.org/tosa/reference_model" tosa_reference_model_rev="f9ea4ab7da19318fe36b1c34d68a3e40fd6e56c5" - + +# vela +vela_repo_url="https://review.mlplatform.org/ml/ethos-u/ethos-u-vela" +vela_rev="a08fc18780827b5fefc814dd0162ee6317ce0ae7" + ######## ### Mandatory user args ######## @@ -198,6 +202,7 @@ function setup_ethos_u() { cd ethos-u git reset --hard ${ethos_u_base_rev} python3 ./fetch_externals.py -c ${ethos_u_base_rev}.json fetch + pip install pyelftools echo "[${FUNCNAME[0]}] Done @ $(git describe --all --long 3> /dev/null) in ${root_dir}/ethos-u dir." } @@ -259,9 +264,9 @@ function setup_vela() { # cd "${root_dir}" if [[ ! 
-e ethos-u-vela ]]; then
-        git clone https://review.mlplatform.org/ml/ethos-u/ethos-u-vela
+        git clone ${vela_repo_url}
         repo_dir="${root_dir}/ethos-u-vela"
-        base_rev=57ce18c89ccc6f6309333dccb24ed30dc68b571f
+        base_rev=${vela_rev}
         patch_repo
     fi
     cd "${root_dir}/ethos-u-vela"

From e83ab0e06a8f1b4153df8f136f3a8ad0455dbd7c Mon Sep 17 00:00:00 2001
From: Zingo Andersen
Date: Tue, 19 Nov 2024 09:23:09 +0100
Subject: [PATCH 19/27] Arm Backend: Update Ethos-U compiler Vela to 4.1.0

This fixes a code generation problem. Some Ethos-U85 tests start working and
some Ethos-U55 tests are disabled due to stricter testing added to the Vela
compiler.

Signed-off-by: Zingo Andersen
Change-Id: I53a10a1675cea34e105e04f864dfa3cb4cc626fa
---
 backends/arm/test/ops/test_bmm.py            | 2 +-
 backends/arm/test/ops/test_depthwise_conv.py | 2 +-
 backends/arm/test/ops/test_layer_norm.py     | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py
index 2cf90b2119..523a90cdc8 100644
--- a/backends/arm/test/ops/test_bmm.py
+++ b/backends/arm/test/ops/test_bmm.py
@@ -161,7 +161,7 @@ def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
         )

     @parameterized.expand(BMM.test_parameters[1:])
-    @unittest.expectedFailure
+    @common.expectedFailureOnFVP
     def test_bmm_u85_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor):
         test_data = (operand1, operand2)
         self._test_bmm_ethosu_BI_pipeline(
diff --git a/backends/arm/test/ops/test_depthwise_conv.py b/backends/arm/test/ops/test_depthwise_conv.py
index 628b25c259..d753245f43 100644
--- a/backends/arm/test/ops/test_depthwise_conv.py
+++ b/backends/arm/test/ops/test_depthwise_conv.py
@@ -260,7 +260,7 @@ def test_dw_conv_tosa_BI(self, test_name: str, model: torch.nn.Module):
         )  # Works

     @parameterized.expand(testsuite_conv2d, skip_on_empty=True)
-    @common.expectedFailureOnFVP
+    @unittest.expectedFailure
     def test_dw_conv2d_u55_BI(
         self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False
     ):
diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py
index 7375a25383..e84dd4ee58 100644
--- a/backends/arm/test/ops/test_layer_norm.py
+++ b/backends/arm/test/ops/test_layer_norm.py
@@ -158,7 +158,7 @@ def test_layer_norm_tosa_BI(
     # Numerical issues on FVP likely due to mul op, MLETORCH-521
     # Skip tests that require transposes.
     @parameterized.expand(test_data_suite[:-2])
-    @common.expectedFailureOnFVP
+    @unittest.expectedFailure
     def test_layer_norm_u55_BI(
         self,
         test_name: str,

From 2f61fbb5ebae4db9b8f9c85d3588abc3392c4d77 Mon Sep 17 00:00:00 2001
From: Zingo Andersen
Date: Tue, 15 Oct 2024 15:39:22 +0200
Subject: [PATCH 20/27] Arm backend: Updated toolchain to arm-gnu-toolchain-13.3.rel1

Updated the toolchain for no other reason than to pick up general improvements.
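
A quick way to confirm the new toolchain is picked up after re-running
examples/arm/setup.sh (hypothetical session; the exact build string depends
on the host platform and release packaging):

    $ arm-none-eabi-gcc --version
    arm-none-eabi-gcc (Arm GNU Toolchain 13.3.Rel1 ...) 13.3.1 ...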
Signed-off-by: Zingo Andersen Change-Id: If65f3986a0011e99f9b0c57bdb072dce6edb97ef --- examples/arm/setup.sh | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/examples/arm/setup.sh b/examples/arm/setup.sh index 84f2371466..6f619ef058 100755 --- a/examples/arm/setup.sh +++ b/examples/arm/setup.sh @@ -55,9 +55,9 @@ if [[ "${ARCH}" == "x86_64" ]]; then corstone320_md5_checksum="3deb3c68f9b2d145833f15374203514d" # toochain - toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi.tar.xz" - toolchain_dir="arm-gnu-toolchain-12.3.rel1-x86_64-arm-none-eabi" - toolchain_md5_checksum="00ebb1b70b1f88906c61206457eacb61" + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/13.3.rel1/binrel/arm-gnu-toolchain-13.3.rel1-x86_64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-13.3.rel1-x86_64-arm-none-eabi" + toolchain_md5_checksum="0601a9588bc5b9c99ad2b56133b7f118" elif [[ "${ARCH}" == "aarch64" ]] || [[ "${ARCH}" == "arm64" ]]; then # FVPs corstone300_url="https://developer.arm.com/-/media/Arm%20Developer%20Community/Downloads/OSS/FVP/Corstone-300/FVP_Corstone_SSE-300_11.22_20_Linux64_armv8l.tgz?rev=9cc6e9a32bb947ca9b21fa162144cb01&hash=7657A4CF27D42E892E3F08D452AAB073" @@ -70,13 +70,13 @@ elif [[ "${ARCH}" == "aarch64" ]] || [[ "${ARCH}" == "arm64" ]]; then # toochain if [[ "${OS}" == "Darwin" ]]; then - toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-darwin-arm64-arm-none-eabi.tar.xz" - toolchain_dir="arm-gnu-toolchain-12.3.rel1-darwin-arm64-arm-none-eabi" - toolchain_md5_checksum="53d034e9423e7f470acc5ed2a066758e" + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/13.3.rel1/binrel/arm-gnu-toolchain-13.3.rel1-darwin-arm64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-13.3.rel1-darwin-arm64-arm-none-eabi" + toolchain_md5_checksum="f1c18320bb3121fa89dca11399273f4e" elif [[ "${OS}" == "Linux" ]]; then - toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/12.3.rel1/binrel/arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi.tar.xz" - toolchain_dir="arm-gnu-toolchain-12.3.rel1-aarch64-arm-none-eabi" - toolchain_md5_checksum="02c9b0d3bb1110575877d8eee1f223f2" + toolchain_url="https://armkeil.blob.core.windows.net/developer/Files/downloads/gnu/13.3.rel1/binrel/arm-gnu-toolchain-13.3.rel1-aarch64-arm-none-eabi.tar.xz" + toolchain_dir="arm-gnu-toolchain-13.3.rel1-aarch64-arm-none-eabi" + toolchain_md5_checksum="303102d97b877ebbeb36b3158994b218" fi else echo "[main] Error: only x86-64 & aarch64/arm64 architecture is supported for now!"; exit 1; @@ -178,15 +178,15 @@ function setup_fvp() { function setup_toolchain() { # Download and install the arm-none-eabi toolchain cd "${root_dir}" - if [[ ! -e gcc.tar.xz ]]; then + if [[ ! -e "${toolchain_dir}.tar.xz" ]]; then echo "[${FUNCNAME[0]}] Downloading toolchain ..." - curl --output gcc.tar.xz "${toolchain_url}" - verify_md5 ${toolchain_md5_checksum} gcc.tar.xz + curl --output "${toolchain_dir}.tar.xz" "${toolchain_url}" + verify_md5 ${toolchain_md5_checksum} "${toolchain_dir}.tar.xz" fi echo "[${FUNCNAME[0]}] Installing toolchain ..." 
rm -rf "${toolchain_dir}" - tar xf gcc.tar.xz + tar xf "${toolchain_dir}.tar.xz" toolchain_bin_path="$(cd ${toolchain_dir}/bin && pwd)" export PATH=${PATH}:${toolchain_bin_path} hash arm-none-eabi-gcc From fc50da13d4ec2051a8db12cf9190669e9cd8cbee Mon Sep 17 00:00:00 2001 From: Eashan Garg Date: Wed, 20 Nov 2024 22:00:29 -0800 Subject: [PATCH 21/27] Buckify arm/test files Summary: Buckify non-test arm files, to allow ArmTester to be used internally Differential Revision: D66283212 --- backends/arm/TARGETS | 11 +++++++++++ backends/arm/test/TARGETS | 23 +++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 backends/arm/test/TARGETS diff --git a/backends/arm/TARGETS b/backends/arm/TARGETS index bd42710d7b..05f6095c37 100644 --- a/backends/arm/TARGETS +++ b/backends/arm/TARGETS @@ -110,3 +110,14 @@ python_library( "//executorch/backends/arm/operators:node_visitor", ], ) + +python_library( + name = "arm_model_evaluator", + src = [ + "util/arm_model_evaluator.py", + ], + typing = True, + deps = [ + "//caffe2:torch", + ] +) diff --git a/backends/arm/test/TARGETS b/backends/arm/test/TARGETS new file mode 100644 index 0000000000..ef092c5503 --- /dev/null +++ b/backends/arm/test/TARGETS @@ -0,0 +1,23 @@ +load("@fbcode_macros//build_defs:python_library.bzl", "python_library") + +python_library( + name = "common", + srcs = ["common.py"], + deps = [ + "//executorch/backends/xnnpack/test/tester:tester", + "//executorch/backends/arm:arm_backend", + "//executorch/exir:lib", + "//executorch/exir/backend:compile_spec_schema", + ] +) + +python_library( + name = "runner_utils", + srcs = ["runner_utils.py"], + deps = [ + "//executorch/backends/xnnpack/test/tester:tester", + "//executorch/backends/arm:arm_backend", + "//executorch/exir:lib", + "//executorch/exir/backend:compile_spec_schema", + ] +) From 12281264631969c1936cafebeba6e6b403e4cb72 Mon Sep 17 00:00:00 2001 From: Benjamin Klimczak Date: Mon, 11 Nov 2024 15:37:23 +0000 Subject: [PATCH 22/27] Add support for torch.ops.aten._to_copy.default Lower torch.ops.aten._to_copy.default to TOSA CAST op. This resolves issues around arithmetic operators when using int scalars in unquantized networks (see new test cases in test_scalars.py). Note: Parameter 'memory_format' is not supported. Change-Id: I7a921ca510c5b46f15b5399218f9230ba0f93d88 --- backends/arm/operator_support/__init__.py | 1 + .../arm/operator_support/to_copy_support.py | 120 ++++++++++++++++++ backends/arm/operators/__init__.py | 1 + backends/arm/operators/op_to_copy.py | 43 +++++++ backends/arm/test/ops/test_scalars.py | 16 ++- backends/arm/test/ops/test_to_copy.py | 70 ++++++++++ 6 files changed, 249 insertions(+), 2 deletions(-) create mode 100644 backends/arm/operator_support/to_copy_support.py create mode 100644 backends/arm/operators/op_to_copy.py create mode 100644 backends/arm/test/ops/test_to_copy.py diff --git a/backends/arm/operator_support/__init__.py b/backends/arm/operator_support/__init__.py index c133ce8003..297047963c 100644 --- a/backends/arm/operator_support/__init__.py +++ b/backends/arm/operator_support/__init__.py @@ -8,6 +8,7 @@ from . 
import ( # noqa mean_dim_support, right_shift_support, + to_copy_support, tosa_supported_operators, var_correction_support, ) diff --git a/backends/arm/operator_support/to_copy_support.py b/backends/arm/operator_support/to_copy_support.py new file mode 100644 index 0000000000..9bba274804 --- /dev/null +++ b/backends/arm/operator_support/to_copy_support.py @@ -0,0 +1,120 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +import logging + +import torch + +import torch.fx as fx + +from executorch.backends.arm.operator_support.tosa_supported_operators import ( + register_tosa_support_check, + SupportedTOSAOperatorCheck, +) +from executorch.backends.arm.tosa_specification import TosaSpecification +from executorch.exir.dialects._ops import ops as exir_ops + +logger = logging.getLogger(__name__) + + +@register_tosa_support_check +class ToCopySupported(SupportedTOSAOperatorCheck): + targets = [exir_ops.edge.aten._to_copy.default] + + tosa_specs = [ + TosaSpecification.create_from_string("TOSA-0.80.0+BI"), + TosaSpecification.create_from_string("TOSA-0.80.0+MI"), + ] + + SupportedTypeDict = dict[torch.dtype, list[torch.dtype]] + + @staticmethod + def _merge_supported_types( + dtypes1: SupportedTypeDict, dtypes2: SupportedTypeDict + ) -> SupportedTypeDict: + merged_dtypes = dtypes1 + for k, v in dtypes2.items(): + merged_dtypes[k] = merged_dtypes.get(k, []) + v + return merged_dtypes + + SUPPORTED_INT_TYPES: SupportedTypeDict = { + torch.bool: [torch.int8, torch.int16, torch.int32], + torch.int8: [torch.bool, torch.int16, torch.int32], + torch.int16: [torch.bool, torch.int8, torch.int32], + torch.int32: [torch.bool, torch.int8, torch.int16], + } + SUPPORTED_FLOAT_TYPES: SupportedTypeDict = { + torch.int8: [torch.float16, torch.bfloat16, torch.float32], + torch.int16: [torch.float16, torch.bfloat16, torch.float32], + torch.int32: [torch.float16, torch.bfloat16, torch.float32], + torch.bfloat16: [torch.int8, torch.int16, torch.int32, torch.float32], + torch.float16: [torch.int8, torch.int16, torch.int32, torch.float32], + torch.float32: [ + torch.int8, + torch.int16, + torch.int32, + torch.bfloat16, + torch.float16, + ], + } + ALL_SUPPORTED_TYPES = _merge_supported_types( + SUPPORTED_INT_TYPES, SUPPORTED_FLOAT_TYPES + ) + POSSIBLE_TYPE_CONVERSIONS = {torch.int64: torch.int32} + + def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool: + assert node.target in self.targets + + if tosa_spec not in self.tosa_specs: + return False + + assert tosa_spec.support_integer() + supported_dtypes = ( + self.ALL_SUPPORTED_TYPES + if tosa_spec.support_float() + else self.SUPPORTED_INT_TYPES + ) + # Take into account possible type conversions + supported_dtypes.update( + (k, supported_dtypes[v]) + for k, v in self.POSSIBLE_TYPE_CONVERSIONS.items() + if v in supported_dtypes + ) + + # Check input type + assert len(node.all_input_nodes) == 1 + input_val = node.all_input_nodes[0].meta["val"] + assert isinstance(input_val, torch._subclasses.FakeTensor) + input_dtype = input_val.dtype + if input_dtype not in supported_dtypes: + logger.info( + f"Input dtype {input_val.dtype} is not supported in " + f"{node.target.name()}." 
+ ) + return False + + # Check output type + output_val = node.meta["val"] + assert isinstance(output_val, torch._subclasses.FakeTensor) + if output_val.dtype not in supported_dtypes[input_dtype]: + logger.info( + f"Output dtype {output_val.dtype} is not supported in " + f"{node.target.name()} for input dtype {input_dtype}. " + f"Supported output types: " + f"{''.join(str(t) for t in supported_dtypes[input_dtype])}" + ) + return False + + # Check memory format + if "memory_format" in node.kwargs: + if node.kwargs["memory_format"] in (torch.preserve_format,): + logger.info( + f"Argument 'memory_format' is not supported for " + f"{node.target.name()} right now." + ) + return False + + return True diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py index a5c2dd8dc5..8c4aa85e57 100644 --- a/backends/arm/operators/__init__.py +++ b/backends/arm/operators/__init__.py @@ -36,6 +36,7 @@ op_sub, op_sum, op_tanh, + op_to_copy, op_transpose, op_unsqueeze, op_upsample_nearest2d, diff --git a/backends/arm/operators/op_to_copy.py b/backends/arm/operators/op_to_copy.py new file mode 100644 index 0000000000..15077d6df7 --- /dev/null +++ b/backends/arm/operators/op_to_copy.py @@ -0,0 +1,43 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +# pyre-unsafe +from typing import List + +import serializer.tosa_serializer as ts +import torch +import tosa.Op as TosaOp + +from executorch.backends.arm.operators.node_visitor import ( + NodeVisitor, + register_node_visitor, +) +from executorch.backends.arm.tosa_mapping import TosaArg + + +@register_node_visitor +class ToCopyVisitor(NodeVisitor): + """ + Implement the type cast functionality of _to_copy. + + Other features like setting of the memory_format or moving a tensor to a + different device are not supported. + + Also note that the node should not be quantized. + """ + + target = "aten._to_copy.default" + + def define_node( + self, + node: torch.fx.Node, + tosa_graph: ts.TosaSerializer, + inputs: List[TosaArg], + output: TosaArg, + is_quant_node: bool, + ) -> None: + assert not is_quant_node, "Casting of quantized values is not supported." + assert inputs + tosa_graph.addOperator(TosaOp.Op().CAST, [inputs[0].name], [output.name]) diff --git a/backends/arm/test/ops/test_scalars.py b/backends/arm/test/ops/test_scalars.py index cd3dd72f60..455b484b94 100644 --- a/backends/arm/test/ops/test_scalars.py +++ b/backends/arm/test/ops/test_scalars.py @@ -153,9 +153,21 @@ def _test_add_tosa_BI_pipeline(self, module: torch.nn.Module, test_data: tuple): .run_method_and_compare_outputs(inputs=test_data) ) - # Most MI tests fail, just show one working for now. - @parameterized.expand((tensor_scalar_tests[6],)) + @parameterized.expand(tensor_scalar_tests) def test_MI(self, test_name: str, op: torch.nn.Module, x, y): + expected_exception = None + if any(token in test_name for token in ("Sub_int", "Sub__int")): + expected_exception = RuntimeError + elif test_name.endswith("_st"): + expected_exception = AttributeError + + if expected_exception: + with self.assertRaises( + expected_exception, msg=f"Test {test_name} is expected to fail." + ): + self._test_add_tosa_MI_pipeline(op, (x, y)) + return + self._test_add_tosa_MI_pipeline(op, (x, y)) # op(Scalar float, tensor) works if the scalar is constant. 
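
Note on the test_scalars.py change above: the int-scalar issue mentioned in
the commit message boils down to graphs like the following minimal sketch
(illustrative only, not part of this diff), where export inserts an implicit
aten._to_copy to reconcile dtypes before the arithmetic op, and this patch
lets that cast be lowered to TOSA CAST so the graph can be delegated in the
unquantized (MI) flow:

    class AddScalar(torch.nn.Module):
        def forward(self, x: torch.Tensor):
            # x is float; the int scalar is lifted to an int tensor and then
            # cast via _to_copy before the add (sketch of the mechanism).
            return x + 3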
diff --git a/backends/arm/test/ops/test_to_copy.py b/backends/arm/test/ops/test_to_copy.py
new file mode 100644
index 0000000000..8499512e10
--- /dev/null
+++ b/backends/arm/test/ops/test_to_copy.py
@@ -0,0 +1,70 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+#
+# Tests the _to_copy op which is interpreted as a cast for our purposes.
+#
+
+import unittest
+
+import torch
+
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+
+from parameterized import parameterized
+
+
+class Cast(torch.nn.Module):
+    def __init__(self, target_dtype):
+        super().__init__()
+        self.target_dtype = target_dtype
+
+    def forward(self, x: torch.Tensor):
+        return x.to(dtype=self.target_dtype)
+
+
+class TestToCopy(unittest.TestCase):
+    """
+    Tests the _to_copy operation.
+
+    Only tests unquantized graphs, as explicit dtype casting interferes with
+    quantization.
+
+    Note: This is also covered by test_scalars.py.
+    """
+
+    _TO_COPY_TEST_DATA = (
+        (torch.rand((1, 2, 3, 4), dtype=torch.float16), torch.float32),
+        (torch.rand((1, 2, 3, 4), dtype=torch.float32), torch.float16),
+        (torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int8), torch.float32),
+        (torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int8), torch.int32),
+        (torch.randint(-127, 128, (1, 2, 3, 4), dtype=torch.int32), torch.int8),
+    )
+
+    def _test_to_copy_tosa_MI_pipeline(
+        self, module: torch.nn.Module, test_data: torch.Tensor
+    ):
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=common.get_tosa_compile_spec("TOSA-0.80.0+MI"),
+            )
+            .export()
+            .dump_artifact()
+            .check_count({"torch.ops.aten._to_copy.default": 1})
+            .to_edge()
+            .dump_artifact()
+            .partition()
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+            .run_method_and_compare_outputs(inputs=test_data)
+        )
+
+    @parameterized.expand(_TO_COPY_TEST_DATA)
+    def test_to_copy_tosa_MI(self, test_tensor: torch.Tensor, new_dtype):
+        self._test_to_copy_tosa_MI_pipeline(Cast(new_dtype), (test_tensor,))

From 3475707dcdba611818db6bebafec4cc5691b3499 Mon Sep 17 00:00:00 2001
From: AIWintermuteAI <32562299+AIWintermuteAI@users.noreply.github.com>
Date: Thu, 28 Nov 2024 17:24:29 +0100
Subject: [PATCH 23/27] Update run.sh to use arm-none-eabi-size

---
 examples/arm/run.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/arm/run.sh b/examples/arm/run.sh
index 0e5fa9db34..cbc96c4b11 100755
--- a/examples/arm/run.sh
+++ b/examples/arm/run.sh
@@ -213,9 +213,9 @@ function build_executorch_runner() {
    cmake --build ${executor_runner_path}/cmake-out --parallel -- arm_executor_runner
    echo "[${FUNCNAME[0]}] Generated baremetal elf file:"
    find ${executor_runner_path}/cmake-out -name "arm_executor_runner"
-    echo "executable_text: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $1}') bytes"
-    echo "executable_data: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $2}') bytes"
-    echo "executable_bss: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec size {} \; | grep -v filename | awk '{print $3}') bytes"
+    echo "executable_text: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $1}') bytes"
+    echo "executable_data: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $2}') bytes"
+    echo "executable_bss: $(find ${executor_runner_path}/cmake-out -name arm_executor_runner -exec arm-none-eabi-size {} \; | grep -v filename | awk '{print $3}') bytes"
 }
 
 # Execute the executor_runner on FVP Simulator

From 8af65d35c341a8ae74f250d7f9885f0ad9f3b33a Mon Sep 17 00:00:00 2001
From: Saoirse Stewart
Date: Thu, 28 Nov 2024 15:14:00 +0000
Subject: [PATCH 24/27] Update the ArmBackend to check the total number of
 elements in the output tensors

* Add a multiple-output sample model to aot_arm_compiler
---
 backends/arm/runtime/ArmBackendEthosU.cpp | 72 +++++++++++++++--------
 examples/arm/aot_arm_compiler.py          | 10 ++++
 2 files changed, 57 insertions(+), 25 deletions(-)

diff --git a/backends/arm/runtime/ArmBackendEthosU.cpp b/backends/arm/runtime/ArmBackendEthosU.cpp
index 99ce0a9df2..a14c42140e 100644
--- a/backends/arm/runtime/ArmBackendEthosU.cpp
+++ b/backends/arm/runtime/ArmBackendEthosU.cpp
@@ -138,6 +138,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
     // TODO(MLETORCH-123): Optimise into direct write from Vela into the SRAM
     // or DRAM output for compatible data layouts.
     for (int i = 0; i < handles.inputs->count; i++) {
+      auto tensor_count = 1, io_count = 1;
       auto tensor_in = args[i]->toTensor();
       char* scratch_addr = handles.scratch_data + handles.inputs->io[i].offset;
 
@@ -202,6 +203,19 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
         ET_LOG(Error, "No matching input copy routine");
         return Error::InvalidProgram;
       }
+      if (!permuted_input_shape) {
+        calculate_dimensions(
+            tensor_in, &handles.inputs->io[i], &tensor_count, &io_count);
+        if (tensor_count != io_count) {
+          ET_LOG(Error, "Input tensor sizes do not match");
+          ET_LOG(
+              Error,
+              "Program expects %d elements but got %d",
+              io_count,
+              tensor_count);
+          return Error::InvalidProgram;
+        }
+      }
     }
 
     // Allocate driver handle and synchronously invoke driver
@@ -236,14 +250,24 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
           result);
       return Error::InvalidProgram;
     }
-
+    int tensor_dim = 0, io_dim = 0;
     // Write outputs from scratch into EValue pointers
    for (int i = 0; i < handles.outputs->count; i++) {
+      int tensor_count = 1, io_count = 1;
      const char* output_addr = handles.scratch_data + handles.outputs->io[i].offset;
       // Process input EValue into scratch
       // Outputs are in the index immediately after inputs
       auto tensor_out = args[handles.inputs->count + i]->toTensor();
+
+      calculate_dimensions(
+          tensor_out, &handles.outputs->io[i], &tensor_count, &io_count);
+
+      // At times the topological order of the outputs may change.
+      // Instead of matching outputs one-to-one, ensure that the summed
+      // element counts across all outputs match.
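+      // Example (illustrative): outputs of shape {1, 10, 4} and {1, 10, 1}
+      // contribute 40 + 10 = 50 elements to tensor_dim, which must equal the
+      // summed element counts of the corresponding VelaIO shapes in io_dim.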
+      tensor_dim = tensor_dim + tensor_count;
+      io_dim = io_dim + io_count;
+
       bool permuted_output_shape;
       ET_CHECK_OK_OR_RETURN_ERROR(check_requires_permute(
           i,
@@ -272,6 +296,12 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
         }
       }
     }
+    if (tensor_dim != io_dim) {
+      ET_LOG(Error, "Total output tensor sizes do not match");
+      ET_LOG(
+          Error, "Program expects size of %d but got %d", io_dim, tensor_dim);
+      return Error::InvalidProgram;
+    }
     return Error::Ok;
   }
 
@@ -280,6 +310,21 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
   }
 
  private:
+  void calculate_dimensions(
+      const executorch::aten::Tensor tensor,
+      VelaIO* io,
+      int* tensor_count,
+      int* io_count) const {
+    for (int i = 0; i < tensor.dim(); i++) {
+      *tensor_count = *tensor_count * tensor.size(i);
+    }
+
+    // The VelaIO type has a shape of fixed size 4
+    for (int i = 0; i < 4; i++) {
+      *io_count = *io_count * io->shape[i];
+    }
+  }
+
   Error check_requires_permute(
       int index,
       const executorch::aten::Tensor tensor,
@@ -287,6 +332,7 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
       bool permuted_io_flag,
       bool* is_permuted) const {
     bool permuted_shape = false;
+
     if (tensor.dim() == 4) {
       // special case for NHWC workaround in AOT; as the compilation has
       // permuted to channel last in an undetectable way, we assume here
@@ -304,30 +350,6 @@ class ArmBackend final : public ::executorch::runtime::BackendInterface {
         return Error::InvalidProgram;
       }
     }
-    if (!permuted_shape) {
-      // Check the number of elements in each tensor match
-      int tensor_count = 1;
-      int io_count = 1;
-
-      for (int i = 0; i < tensor.dim(); i++) {
-        tensor_count = tensor_count * tensor.size(i);
-      }
-
-      // The VelaIO type has a shape of fixed size 4
-      for (int i = 0; i < 4; i++) {
-        io_count = io_count * io->shape[i];
-      }
-
-      if (tensor_count != io_count) {
-        ET_LOG(Error, "Input tensor sizes do not match");
-        ET_LOG(
-            Error,
-            "Program expects %d elements but got %d",
-            io_count,
-            tensor_count);
-        return Error::InvalidProgram;
-      }
-    }
     *is_permuted = permuted_shape;
     return Error::Ok;
   }
diff --git a/examples/arm/aot_arm_compiler.py b/examples/arm/aot_arm_compiler.py
index a16d947dd6..6d899c2146 100644
--- a/examples/arm/aot_arm_compiler.py
+++ b/examples/arm/aot_arm_compiler.py
@@ -172,11 +172,21 @@ def forward(self, x):
     can_delegate = False
 
 
+class MultipleOutputsModule(torch.nn.Module):
+
+    def forward(self, x: torch.Tensor, y: torch.Tensor):
+        return (x * y, x.sum(dim=-1, keepdim=True))
+
+    example_input = (torch.randn(10, 4, 5), torch.randn(10, 4, 5))
+    can_delegate = True
+
+
 models = {
     "add": AddModule,
     "add2": AddModule2,
     "add3": AddModule3,
     "softmax": SoftmaxModule,
+    "MultipleOutputsModule": MultipleOutputsModule,
 }
 
 calibration_data = {

From 1c9abfa6fa33fd365d51919669d159c4babf8057 Mon Sep 17 00:00:00 2001
From: Adrian Lundell
Date: Wed, 13 Nov 2024 11:08:59 +0100
Subject: [PATCH 25/27] [Arm backend] Support keep_dims == True for meandim
 and var ops

- Adds keepdim support in the decompose_var/decompose_meandim passes
- Renames insert_squeeze_after_sum to a more general name for future ops
- Adds the get_node_arg/set_node_arg helper functions
- Updates TOSASupportedOperators

Change-Id: Ifda19d1c3ed67d03d0c896bf4f74253d875354cc
---
 backends/arm/_passes/arm_pass_manager.py      |  6 +-
 backends/arm/_passes/arm_pass_utils.py        | 58 +++++++++++++++++++
 .../arm/_passes/decompose_meandim_pass.py     | 13 +++--
 backends/arm/_passes/decompose_var_pass.py    | 27 +++++----
 ....py => keep_dims_false_to_squeeze_pass.py} | 42 +++++++++++---
 backends/arm/operator_support/__init__.py     |  8 +--
 .../arm/operator_support/mean_dim_support.py  | 33 -----------
 .../tosa_supported_operators.py               |  3 +
 .../var_correction_support.py                 | 33 -----------
 backends/arm/test/ops/test_mean_dim.py        |  4 +-
 backends/arm/test/ops/test_var.py             |  8 +--
 .../passes/test_meandim_to_averagepool2d.py   |  8 ++-
 12 files changed, 135 insertions(+), 108 deletions(-)
 rename backends/arm/_passes/{insert_squeeze_after_sum_pass.py => keep_dims_false_to_squeeze_pass.py} (58%)
 delete mode 100644 backends/arm/operator_support/mean_dim_support.py
 delete mode 100644 backends/arm/operator_support/var_correction_support.py

diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
index a72cdfd1a0..1e2b26ef64 100644
--- a/backends/arm/_passes/arm_pass_manager.py
+++ b/backends/arm/_passes/arm_pass_manager.py
@@ -29,8 +29,8 @@
     DecomposeSoftmaxesPass,
 )
 from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass
-from executorch.backends.arm._passes.insert_squeeze_after_sum_pass import (
-    InsertSqueezeAfterSumPass,
+from executorch.backends.arm._passes.keep_dims_false_to_squeeze_pass import (
+    KeepDimsFalseToSqueezePass,
 )
 from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass
 from executorch.backends.arm._passes.meandim_to_averagepool_pass import (
@@ -71,7 +71,7 @@ def transform_to_backend_pipeline(
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(MatchArgRanksPass(exported_program))
         self.add_pass(DecomposeDivPass())
-        self.add_pass(InsertSqueezeAfterSumPass())
+        self.add_pass(KeepDimsFalseToSqueezePass())
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(Conv1dUnsqueezePass(exported_program))
         self.add_pass(DecomposeSoftmaxesPass())
diff --git a/backends/arm/_passes/arm_pass_utils.py b/backends/arm/_passes/arm_pass_utils.py
index 3fcf724e5b..78ee6e265c 100644
--- a/backends/arm/_passes/arm_pass_utils.py
+++ b/backends/arm/_passes/arm_pass_utils.py
@@ -7,6 +7,7 @@
 
 # pyre-unsafe
 
+from inspect import isclass
 from typing import Optional
 
 import torch
@@ -133,3 +134,60 @@ def get_first_fake_tensor(node: torch.fx.Node) -> FakeTensor:
         fake_tensor, FakeTensor
     ), f'Found {fake_tensor} in meta["val"] of {node}, expected to find FakeTensor.'
     return fake_tensor
+
+
+def get_node_arg(args: list | dict, key: int | str | type, default_value=None):
+    """
+    Helper function for getting a value from node.args/kwargs, three cases:
+    1. By position in node.args - Returns the arg at the given position, or default_value if the index is one past the end
+    2. By key in node.kwargs - Returns the kwarg with the given key, or default_value if it does not exist
+    3. By type in node.args - Returns the first arg of the given type. Useful for cases where arg positions may differ but types are unique.
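+
+    Example (illustrative), for a node with args = (x, [1], True):
+        get_node_arg(node.args, 2, False)           -> True  (by position)
+        get_node_arg(node.kwargs, "keepdim", False) -> False if the kwarg is absent (by key)
+        get_node_arg(node.args, bool, False)        -> True  (first bool-typed arg)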
+    """
+    if isinstance(key, int):
+        if 0 <= key < len(args):
+            return args[key]
+        elif key == len(args):
+            if default_value is not None:
+                return default_value
+            else:
+                raise RuntimeError(f"No default value given for index {key}")
+        else:
+            raise RuntimeError(
+                f"Out of bounds index {key} for getting value in args (of size {len(args)})"
+            )
+    elif isinstance(key, str):
+        return args.get(key, default_value)
+    elif isclass(key):
+        for arg in args:
+            if isinstance(arg, key):
+                return arg
+        if default_value is not None:
+            return default_value
+        else:
+            raise RuntimeError(f"No arg of type {key}")
+    else:
+        raise RuntimeError("Invalid type")
+
+
+def set_node_arg(node: torch.fx.Node, i: int | str, value):
+    """
+    Helper function for setting a value in node.args/kwargs. If the index equals the list length, the value is instead appended to the list.
+    """
+    if isinstance(i, int):
+        if 0 <= i < len(node.args):
+            args = list(node.args)
+            args[i] = value
+            node.args = tuple(args)
+            return
+        elif i == len(node.args):
+            node.args = node.args + (value,)
+        else:
+            raise RuntimeError(
+                f"Out of bounds index {i} for setting value in {node} args (of size {len(node.args)})"
+            )
+    elif isinstance(i, str):
+        kwargs = dict(node.kwargs)
+        kwargs[i] = value
+        node.kwargs = kwargs
+    else:
+        raise RuntimeError("Invalid type")
diff --git a/backends/arm/_passes/decompose_meandim_pass.py b/backends/arm/_passes/decompose_meandim_pass.py
index d927fd613c..abf5c8f363 100644
--- a/backends/arm/_passes/decompose_meandim_pass.py
+++ b/backends/arm/_passes/decompose_meandim_pass.py
@@ -7,6 +7,7 @@
 # pyre-unsafe
 
 import torch
+from executorch.backends.arm._passes.arm_pass_utils import get_node_arg
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass
 
@@ -42,16 +43,16 @@ def call_operator(self, op, args, kwargs, meta):
         if op not in (exir_ops.edge.aten.mean.dim, torch.ops.aten.mean.dim):
             return super().call_operator(op, args, kwargs, meta)
 
-        x = args[0]
-        dim = args[1]
-        keepdim = args[2] if len(args) > 2 else False
-        if not keepdim:
-            return super().call_operator(op, args, kwargs, meta)
-
-        # if keepdim == True and dim == [-1, -2], mean.dim can be
+        x = get_node_arg(args, 0)
+        dim = get_node_arg(args, 1)
+        keepdim = get_node_arg(args, 2, False)
+
+        # if dim == [-1, -2], mean.dim can be
         # decomposed to avg_pool2d. This is handled by ConvertMeanDimToAveragePool.
         if dim == [-1, -2]:
             # Simply return the mean.dim operator for future decomposition.
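+            # Illustrative: x.mean(dim=[-1, -2], keepdim=True) on an NCHW
+            # tensor is a global average pool over the H and W axes.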
return super().call_operator(op, args, kwargs, meta) + shape = meta["val"].size() dtype = meta["val"].dtype input_shape = x.data.size() diff --git a/backends/arm/_passes/decompose_var_pass.py b/backends/arm/_passes/decompose_var_pass.py index cc8f0eb6da..283760e423 100644 --- a/backends/arm/_passes/decompose_var_pass.py +++ b/backends/arm/_passes/decompose_var_pass.py @@ -8,6 +8,7 @@ import torch +from executorch.backends.arm._passes.arm_pass_utils import get_node_arg from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass @@ -53,26 +54,30 @@ def call_operator(self, op, args, kwargs, meta): torch.ops.aten.var.dim, ): return super().call_operator(op, args, kwargs, meta) - shape = meta["val"].size() + + x = args[0] + input_shape = x.data.size() + shape = list(meta["val"].size()) + if shape == []: + shape = [1 for _ in input_shape] + dtype = meta["val"].dtype - dim = args[1] if len(args) > 1 else list(range(len(shape))) + # Get dim from args based on argument type + dim = get_node_arg(args, key=list, default_value=list(range(len(shape)))) + if op == torch.ops.aten.var.dim: - correction = args[-2] - keepdim = args[-1] + keepdim = get_node_arg(args, bool, False) + correction = get_node_arg(args, int, 1) else: - correction = kwargs["correction"] - keepdim = kwargs.get("keepdim", False) - if not keepdim: - return super().call_operator(op, args, kwargs, meta) + correction = get_node_arg(kwargs, "correction", 1) + keepdim = get_node_arg(kwargs, "keepdim", False) - x = args[0] - input_shape = x.data.size() N = 1 for d in dim: N *= input_shape[d] mean_op, diff_op, mul_op, sum_op, full_op = get_var_decomposition(op) - mean = super().call_operator(mean_op, (x, dim, keepdim), {}, meta) + mean = super().call_operator(mean_op, (x, dim, True), {}, meta) diff = super().call_operator(diff_op, (x, mean), {}, meta) squared_diff = super().call_operator(mul_op, (diff, diff), {}, meta) sum = super().call_operator(sum_op, (squared_diff, dim, keepdim), {}, meta) diff --git a/backends/arm/_passes/insert_squeeze_after_sum_pass.py b/backends/arm/_passes/keep_dims_false_to_squeeze_pass.py similarity index 58% rename from backends/arm/_passes/insert_squeeze_after_sum_pass.py rename to backends/arm/_passes/keep_dims_false_to_squeeze_pass.py index e088c2e35a..736c627d91 100644 --- a/backends/arm/_passes/insert_squeeze_after_sum_pass.py +++ b/backends/arm/_passes/keep_dims_false_to_squeeze_pass.py @@ -10,14 +10,18 @@ import torch import torch.fx -from executorch.backends.arm._passes.arm_pass_utils import create_node +from executorch.backends.arm._passes.arm_pass_utils import ( + create_node, + get_node_arg, + set_node_arg, +) from executorch.exir.dialects._ops import ops as exir_ops from executorch.exir.pass_base import ExportPass, PassResult -class InsertSqueezeAfterSumPass(ExportPass): +class KeepDimsFalseToSqueezePass(ExportPass): """ - In Pytorch, the default behaviour of Tensor.sum is to squeeze + In Pytorch, the default behaviour of for example Tensor.sum is to squeeze the dimension that is summed (keep_dim = False). However, in TOSA, REDUCE_SUM always preserves the rank of the input (keep_dim = True). 
@@ -31,21 +35,44 @@ class InsertSqueezeAfterSumPass(ExportPass): squeeze(dim = dims) """ + # CURRENTLY NOT HANDLED OPS + # exir_ops.edge.aten.amax, + # exir_ops.edge.aten.amin, + # exir_ops.edge.aten.any.dim, + # exir_ops.edge.aten.any.dims, + # exir_ops.edge.aten.argmax, + # exir_ops.edge.aten.argmin, + # exir_ops.edge.aten.max.dim, + # exir_ops.edge.aten.min.dim, + # exir_ops.edge.aten.prod.dim_int, + + # HANDLED OPS + # exir_ops.edge.aten.sum.dim_IntList + # exir_ops.edge.aten.var.correction (decomposed in decompose_var_pass) + # exir_ops.edge.aten.var.dim (decomposed in decompose_var_pass) + # exir_ops.edge.aten.mean.dim (decomposed in decompose_meandim_pass) + def call(self, graph_module: torch.fx.GraphModule): for node in graph_module.graph.nodes: + keep_dim_index = None + if node.op != "call_function": continue - if node.target != exir_ops.edge.aten.sum.dim_IntList: + if node.target == exir_ops.edge.aten.sum.dim_IntList: + keep_dim_index = 2 + else: continue + sum_node = cast(torch.fx.Node, node) - keep_dim = cast(bool, sum_node.args[2] if len(sum_node.args) > 2 else False) + keep_dim = get_node_arg(sum_node.args, keep_dim_index, False) + if keep_dim: continue - dim_list = cast(list[int], sum_node.args[1]) + dim_list = get_node_arg(sum_node.args, 1, [0]) # Add keep_dim = True arg to sum node. - sum_node.args = sum_node.args[0:2] + (True,) + set_node_arg(sum_node, 2, True) with graph_module.graph.inserting_after(sum_node): squeeze_node = create_node( @@ -53,6 +80,7 @@ def call(self, graph_module: torch.fx.GraphModule): ) sum_node.replace_all_uses_with(squeeze_node) squeeze_node.args = (sum_node, dim_list) + graph_module.graph.eliminate_dead_code() graph_module.recompile() graph_module = super().call(graph_module).graph_module diff --git a/backends/arm/operator_support/__init__.py b/backends/arm/operator_support/__init__.py index 297047963c..08f58b1e43 100644 --- a/backends/arm/operator_support/__init__.py +++ b/backends/arm/operator_support/__init__.py @@ -5,10 +5,4 @@ # pyre-unsafe -from . import ( # noqa - mean_dim_support, - right_shift_support, - to_copy_support, - tosa_supported_operators, - var_correction_support, -) +from . import right_shift_support, to_copy_support, tosa_supported_operators # noqa diff --git a/backends/arm/operator_support/mean_dim_support.py b/backends/arm/operator_support/mean_dim_support.py deleted file mode 100644 index 67a7c20406..0000000000 --- a/backends/arm/operator_support/mean_dim_support.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2024 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-unsafe - -from typing import cast - -import torch.fx as fx - -from executorch.backends.arm.operator_support.tosa_supported_operators import ( - register_tosa_support_check, - SupportedTOSAOperatorCheck, -) -from executorch.backends.arm.tosa_specification import TosaSpecification -from executorch.exir.dialects._ops import ops as exir_ops - - -@register_tosa_support_check -class MeanDimSupported(SupportedTOSAOperatorCheck): - targets = [exir_ops.edge.aten.mean.dim] - - tosa_specs = [ - TosaSpecification.create_from_string("TOSA-0.80.0+BI"), - TosaSpecification.create_from_string("TOSA-0.80.0+MI"), - ] - - def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool: - assert node.target in self.targets - - keep_dim = node.args[2] if len(node.args) > 2 else False - return cast(bool, keep_dim) diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py index 3563ee9c51..7072ba6a82 100644 --- a/backends/arm/operator_support/tosa_supported_operators.py +++ b/backends/arm/operator_support/tosa_supported_operators.py @@ -92,6 +92,7 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool: exir_ops.edge.aten.avg_pool2d.default, exir_ops.edge.aten.max_pool2d_with_indices.default, exir_ops.edge.aten.sigmoid.default, + exir_ops.edge.aten.mean.dim, exir_ops.edge.aten.mm.default, exir_ops.edge.aten.repeat.default, exir_ops.edge.aten.reciprocal.default, @@ -105,6 +106,8 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool: exir_ops.edge.aten.sum.dim_IntList, exir_ops.edge.aten.tanh.default, exir_ops.edge.aten.upsample_nearest2d.vec, + exir_ops.edge.aten.var.correction, + exir_ops.edge.aten.var.dim, exir_ops.edge.aten.view_copy.default, exir_ops.edge.aten.clone.default, exir_ops.edge.aten.unsqueeze_copy.default, diff --git a/backends/arm/operator_support/var_correction_support.py b/backends/arm/operator_support/var_correction_support.py deleted file mode 100644 index 4aa2ae5e97..0000000000 --- a/backends/arm/operator_support/var_correction_support.py +++ /dev/null @@ -1,33 +0,0 @@ -# Copyright 2024 Arm Limited and/or its affiliates. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- -# pyre-unsafe - -from typing import cast - -import torch.fx as fx - -from executorch.backends.arm.operator_support.tosa_supported_operators import ( - register_tosa_support_check, - SupportedTOSAOperatorCheck, -) -from executorch.backends.arm.tosa_specification import TosaSpecification -from executorch.exir.dialects._ops import ops as exir_ops - - -@register_tosa_support_check -class VarCorrectionSupported(SupportedTOSAOperatorCheck): - targets = [exir_ops.edge.aten.var.correction] - - tosa_specs = [ - TosaSpecification.create_from_string("TOSA-0.80.0+BI"), - TosaSpecification.create_from_string("TOSA-0.80.0+MI"), - ] - - def is_node_supported(self, node: fx.Node, tosa_spec: TosaSpecification) -> bool: - assert node.target in self.targets - - keep_dim = node.kwargs.get("keepdim", False) - return cast(bool, keep_dim) diff --git a/backends/arm/test/ops/test_mean_dim.py b/backends/arm/test/ops/test_mean_dim.py index 3cb8c5f815..e725eb1ef4 100644 --- a/backends/arm/test/ops/test_mean_dim.py +++ b/backends/arm/test/ops/test_mean_dim.py @@ -51,7 +51,7 @@ class MeanDim(torch.nn.Module): test_data_suite = [ # (test_name, test_data) ("zeros", torch.zeros(1, 1280, 7, 7), -1, True), - ("ones", torch.ones(1, 1280, 7, 7), (-1, 2), True), + ("ones", torch.ones(1, 1280, 7, 7), (-1, 2), False), ( "rand", torch.rand(1, 1280, 7, 7), @@ -62,7 +62,7 @@ class MeanDim(torch.nn.Module): "randn", torch.randn(1, 1280, 7, 7), (-1, -2, -3), - True, + False, ), ] diff --git a/backends/arm/test/ops/test_var.py b/backends/arm/test/ops/test_var.py index 06671848cc..727cd05393 100644 --- a/backends/arm/test/ops/test_var.py +++ b/backends/arm/test/ops/test_var.py @@ -29,9 +29,9 @@ class TestVar(unittest.TestCase): class Var(torch.nn.Module): test_parameters = [ (torch.randn(1, 50, 10, 20), True, 0), - (torch.rand(1, 50, 10), True, 0), + (torch.rand(1, 50, 10), False, 0), (torch.randn(1, 30, 15, 20), True, 1), - (torch.rand(1, 50, 10, 20), True, 0.5), + (torch.rand(1, 50, 10, 20), False, 0.5), ] def forward( @@ -45,9 +45,9 @@ def forward( class VarDim(torch.nn.Module): test_parameters = [ (torch.randn(1, 50, 10, 20), 1, True, False), - (torch.rand(1, 50, 10), -2, True, False), + (torch.rand(1, 50, 10), -2, False, False), (torch.randn(1, 30, 15, 20), -3, True, True), - (torch.rand(1, 50, 10, 20), -1, True, True), + (torch.rand(1, 50, 10, 20), -1, False, True), ] test_parameters_u55 = [ diff --git a/backends/arm/test/passes/test_meandim_to_averagepool2d.py b/backends/arm/test/passes/test_meandim_to_averagepool2d.py index 615187fb65..978a4c6fe5 100644 --- a/backends/arm/test/passes/test_meandim_to_averagepool2d.py +++ b/backends/arm/test/passes/test_meandim_to_averagepool2d.py @@ -68,8 +68,12 @@ def test_tosa_BI_meandim_no_modification(self): .quantize() .export() .to_edge() - .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check(["aten_sum_dim_int_list"]) + .check(["aten_full_default"]) + .check(["aten_mul_tensor"]) .run_passes(test_pass_stage) - .check(["executorch_exir_dialects_edge__ops_aten_mean_dim"]) + .check(["aten_sum_dim_int_list"]) + .check(["aten_full_default"]) + .check(["aten_mul_tensor"]) .check_not(["executorch_exir_dialects_edge__ops_aten_avg_pool2d_default"]) ) From a3476654d72e445680d0daea417673c4b0ddfc0d Mon Sep 17 00:00:00 2001 From: Fredrik Knutsson Date: Tue, 26 Nov 2024 09:41:53 +0100 Subject: [PATCH 26/27] Improvements to Arm backend pytest setup * Add pytest fixture to ensure randomness on tests * Move pytest setup to conftest.py to separate pytest commonalities from general ones * 
Minor error catching improvements * Removed pytest dependency in ArmTester Change-Id: I9132681d705c1501391f3d4603f5d6f0786db873 --- backends/arm/test/common.py | 180 ++-------------- backends/arm/test/conftest.py | 196 ++++++++++++++++++ backends/arm/test/misc/test_debug_feats.py | 8 +- .../arm/test/models/test_mobilenet_v2_arm.py | 6 +- backends/arm/test/ops/test_add.py | 4 +- backends/arm/test/ops/test_avg_pool.py | 4 +- backends/arm/test/ops/test_bmm.py | 6 +- backends/arm/test/ops/test_cat.py | 9 +- backends/arm/test/ops/test_clone.py | 4 +- backends/arm/test/ops/test_conv1d.py | 5 +- backends/arm/test/ops/test_conv2d.py | 5 +- backends/arm/test/ops/test_conv_combos.py | 4 +- backends/arm/test/ops/test_depthwise_conv.py | 6 +- backends/arm/test/ops/test_div.py | 8 +- backends/arm/test/ops/test_exp.py | 4 +- backends/arm/test/ops/test_expand.py | 8 +- backends/arm/test/ops/test_full.py | 8 +- backends/arm/test/ops/test_hardtanh.py | 4 +- backends/arm/test/ops/test_layer_norm.py | 4 +- backends/arm/test/ops/test_linear.py | 4 +- backends/arm/test/ops/test_log.py | 4 +- backends/arm/test/ops/test_max_pool.py | 14 +- backends/arm/test/ops/test_mul.py | 4 +- backends/arm/test/ops/test_permute.py | 6 +- backends/arm/test/ops/test_reciprocal.py | 4 +- backends/arm/test/ops/test_sub.py | 5 +- backends/arm/test/runner_utils.py | 4 +- backends/arm/test/tester/arm_tester.py | 10 +- examples/arm/README.md | 2 +- 29 files changed, 283 insertions(+), 247 deletions(-) create mode 100644 backends/arm/test/conftest.py diff --git a/backends/arm/test/common.py b/backends/arm/test/common.py index 17353cab31..48214a48a7 100644 --- a/backends/arm/test/common.py +++ b/backends/arm/test/common.py @@ -4,156 +4,33 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import logging + import os -import platform -import shutil -import subprocess -import sys + import tempfile from datetime import datetime -from enum import auto, Enum from pathlib import Path -from typing import Any - -import pytest -import torch +from conftest import is_option_enabled from executorch.backends.arm.arm_backend import ArmCompileSpecBuilder from executorch.exir.backend.compile_spec_schema import CompileSpec -class arm_test_options(Enum): - quantize_io = auto() - corstone300 = auto() - dump_path = auto() - date_format = auto() - fast_fvp = auto() - - -_test_options: dict[arm_test_options, Any] = {} - -# ==== Pytest hooks ==== - - -def pytest_addoption(parser): - parser.addoption("--arm_quantize_io", action="store_true") - parser.addoption("--arm_run_corstone300", action="store_true") - parser.addoption("--default_dump_path", default=None) - parser.addoption("--date_format", default="%d-%b-%H:%M:%S") - parser.addoption("--fast_fvp", action="store_true") - - -def pytest_configure(config): - if config.option.arm_quantize_io: - load_libquantized_ops_aot_lib() - _test_options[arm_test_options.quantize_io] = True - if config.option.arm_run_corstone300: - corstone300_exists = shutil.which("FVP_Corstone_SSE-300_Ethos-U55") - if not corstone300_exists: - raise RuntimeError( - "Tests are run with --arm_run_corstone300 but corstone300 FVP is not installed." 
- ) - _test_options[arm_test_options.corstone300] = True - if config.option.default_dump_path: - dump_path = Path(config.option.default_dump_path).expanduser() - if dump_path.exists() and os.path.isdir(dump_path): - _test_options[arm_test_options.dump_path] = dump_path - else: - raise RuntimeError( - f"Supplied argument 'default_dump_path={dump_path}' that does not exist or is not a directory." - ) - _test_options[arm_test_options.date_format] = config.option.date_format - _test_options[arm_test_options.fast_fvp] = config.option.fast_fvp - logging.basicConfig(level=logging.INFO, stream=sys.stdout) - - -def pytest_collection_modifyitems(config, items): - if not config.option.arm_quantize_io: - skip_if_aot_lib_not_loaded = pytest.mark.skip( - "u55 tests can only run with quantize_io=True." - ) - - for item in items: - if "u55" in item.name: - item.add_marker(skip_if_aot_lib_not_loaded) - - -def pytest_sessionstart(session): - pass - - -def pytest_sessionfinish(session, exitstatus): - if get_option(arm_test_options.dump_path): - _clean_dir( - get_option(arm_test_options.dump_path), - f"ArmTester_{get_option(arm_test_options.date_format)}.log", - ) - - -# ==== End of Pytest hooks ===== - -# ==== Custom Pytest decorators ===== - - -def expectedFailureOnFVP(test_item): - if is_option_enabled("corstone300"): - test_item.__unittest_expecting_failure__ = True - return test_item - - -# ==== End of Custom Pytest decorators ===== - - -def load_libquantized_ops_aot_lib(): - so_ext = { - "Darwin": "dylib", - "Linux": "so", - "Windows": "dll", - }.get(platform.system(), None) - - find_lib_cmd = [ - "find", - "cmake-out-aot-lib", - "-name", - f"libquantized_ops_aot_lib.{so_ext}", - ] - res = subprocess.run(find_lib_cmd, capture_output=True) - if res.returncode == 0: - library_path = res.stdout.decode().strip() - torch.ops.load_library(library_path) - - -def is_option_enabled( - option: str | arm_test_options, fail_if_not_enabled: bool = False -) -> bool: - """ - Returns whether an option is successfully enabled, i.e. if the flag was - given to pytest and the necessary requirements are available. - Implemented options are: - - corstone300. - - quantize_io. - - The optional parameter 'fail_if_not_enabled' makes the function raise - a RuntimeError instead of returning False. +def get_time_formatted_path(path: str, log_prefix: str) -> str: """ - if isinstance(option, str): - option = arm_test_options[option.lower()] - - if option in _test_options and _test_options[option]: - return True - else: - if fail_if_not_enabled: - raise RuntimeError(f"Required option '{option}' for test is not enabled") - else: - return False + Returns the log path with the current time appended to it. Used for debugging. + Args: + path: The path to the folder where the log file will be stored. + log_prefix: The name of the test. 
-def get_option(option: arm_test_options) -> Any | None: - if option in _test_options: - return _test_options[option] - return None + Example output: + './my_log_folder/test_BI_artifact_28-Nov-14:14:38.log' + """ + return str( + Path(path) / f"{log_prefix}_{datetime.now().strftime('%d-%b-%H:%M:%S')}.log" + ) def maybe_get_tosa_collate_path() -> str | None: @@ -303,35 +180,6 @@ def get_u85_compile_spec_unbuilt( return compile_spec -def current_time_formated() -> str: - """Return current time as a formated string""" - return datetime.now().strftime(get_option(arm_test_options.date_format)) - - -def _clean_dir(dir: Path, filter: str, num_save=10): - sorted_files: list[tuple[datetime, Path]] = [] - for file in dir.iterdir(): - try: - creation_time = datetime.strptime(file.name, filter) - insert_index = -1 - for i, to_compare in enumerate(sorted_files): - compare_time = to_compare[0] - if creation_time < compare_time: - insert_index = i - break - if insert_index == -1 and len(sorted_files) < num_save: - sorted_files.append((creation_time, file)) - else: - sorted_files.insert(insert_index, (creation_time, file)) - except ValueError: - continue - - if len(sorted_files) > num_save: - for remove in sorted_files[0 : len(sorted_files) - num_save]: - file = remove[1] - file.unlink() - - def get_target_board(compile_spec: list[CompileSpec]) -> str | None: for spec in compile_spec: if spec.key == "compile_flags": diff --git a/backends/arm/test/conftest.py b/backends/arm/test/conftest.py new file mode 100644 index 0000000000..a94adb9a89 --- /dev/null +++ b/backends/arm/test/conftest.py @@ -0,0 +1,196 @@ +# Copyright 2024 Arm Limited and/or its affiliates. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +import logging +import os +import platform +import random +import re +import shutil +import subprocess +import sys +from enum import auto, Enum +from typing import Any + +import pytest +import torch + +""" +This file contains the pytest hooks, fixtures etc. for the Arm test suite. +""" + + +class arm_test_options(Enum): + quantize_io = auto() + corstone_fvp = auto() + fast_fvp = auto() + + +_test_options: dict[arm_test_options, Any] = {} + +# ==== Pytest hooks ==== + + +def pytest_configure(config): + if config.option.arm_quantize_io: + _load_libquantized_ops_aot_lib() + _test_options[arm_test_options.quantize_io] = True + if config.option.arm_run_corstoneFVP: + corstone300_exists = shutil.which("FVP_Corstone_SSE-300_Ethos-U55") + corstone320_exists = shutil.which("FVP_Corstone_SSE-320") + if not (corstone300_exists and corstone320_exists): + raise RuntimeError( + "Tests are run with --arm_run_corstoneFVP but corstone FVP is not installed." + ) + _test_options[arm_test_options.corstone_fvp] = True + _test_options[arm_test_options.fast_fvp] = config.option.fast_fvp + logging.basicConfig(level=logging.INFO, stream=sys.stdout) + + +def pytest_collection_modifyitems(config, items): + """ + Skip all tests that require run on Ethos-U if the option arm_quantize_io is + not set. + """ + if not config.option.arm_quantize_io: + skip_if_aot_lib_not_loaded = pytest.mark.skip( + "Ethos-U tests can only run on FVP with quantize_io=True." 
+        )
+
+        for item in items:
+            if re.search(r"u55|u65|u85", item.name, re.IGNORECASE):
+                item.add_marker(skip_if_aot_lib_not_loaded)
+
+
+def pytest_addoption(parser):
+    parser.addoption("--arm_quantize_io", action="store_true")
+    parser.addoption("--arm_run_corstoneFVP", action="store_true")
+    parser.addoption("--fast_fvp", action="store_true")
+
+
+def pytest_sessionstart(session):
+    pass
+
+
+def pytest_sessionfinish(session, exitstatus):
+    pass
+
+
+# ==== End of Pytest hooks =====
+
+
+# ==== Pytest fixtures =====
+
+
+@pytest.fixture(autouse=True)
+def set_random_seed():
+    """
+    Control random numbers in the Arm test suite. The default behavior is a
+    random seed, which is set before each test. Use the env variable
+    ARM_TEST_SEED to override the default behavior, or set it to RANDOM if
+    you want to be explicit.
+
+    Examples:
+    By default, a random seed is used for each test
+        ARM_TEST_SEED=RANDOM pytest --config-file=/dev/null --verbose -s --color=yes  backends/arm/test/ops/test_avg_pool.py -k <TESTCASE>
+    Rerun with a specific seed reported by an earlier random-seed run
+        ARM_TEST_SEED=3478246 pytest --config-file=/dev/null --verbose -s --color=yes  backends/arm/test/ops/test_avg_pool.py -k <TESTCASE>
+    """
+    if os.environ.get("ARM_TEST_SEED", "RANDOM") == "RANDOM":
+        random.seed()  # reset seed, in case any other test has fiddled with it
+        seed = random.randint(0, 2**32 - 1)
+        random.seed(seed)
+        torch.manual_seed(seed)
+    else:
+        seed_str = os.environ.get("ARM_TEST_SEED", "0")
+        if seed_str.isdigit():
+            seed = int(seed_str)
+            random.seed(seed)
+            torch.manual_seed(seed)
+        else:
+            raise TypeError(
+                "ARM_TEST_SEED env variable must be an integer or the string RANDOM"
+            )
+
+    print(f" ARM_TEST_SEED={seed} ", end=" ")
+
+
+# ==== End of Pytest fixtures =====
+
+
+# ==== Custom Pytest decorators =====
+
+
+def expectedFailureOnFVP(test_item):
+    if is_option_enabled("corstone_fvp"):
+        test_item.__unittest_expecting_failure__ = True
+    return test_item
+
+
+# ==== End of Custom Pytest decorators =====
+
+
+def is_option_enabled(
+    option: str | arm_test_options, fail_if_not_enabled: bool = False
+) -> bool:
+    """
+    Returns whether an option is successfully enabled, i.e. if the flag was
+    given to pytest and the necessary requirements are available.
+    Implemented options are:
+    - corstone_fvp.
+    - quantize_io.
+
+    The optional parameter 'fail_if_not_enabled' makes the function raise
+    a RuntimeError instead of returning False.
+    """
+    if isinstance(option, str):
+        option = arm_test_options[option.lower()]
+
+    if option in _test_options and _test_options[option]:
+        return True
+    else:
+        if fail_if_not_enabled:
+            raise RuntimeError(f"Required option '{option}' for test is not enabled")
+        else:
+            return False
+
+
+def get_option(option: arm_test_options) -> Any | None:
+    """
+    Returns the value of a pytest option if it is set, otherwise None.
+
+    Args:
+        option (arm_test_options): The option to check for.
+    """
+    if option in _test_options:
+        return _test_options[option]
+    return None
+
+
+def _load_libquantized_ops_aot_lib():
+    """
+    Load the libquantized_ops_aot_lib shared library. It's required when
+    arm_quantize_io is set.
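+
+    The library is searched for under cmake-out-aot-lib using the
+    platform-specific extension, and loaded into torch.ops so that the
+    quantized custom ops resolve at test time.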
+ """ + so_ext = { + "Darwin": "dylib", + "Linux": "so", + "Windows": "dll", + }.get(platform.system(), None) + + find_lib_cmd = [ + "find", + "cmake-out-aot-lib", + "-name", + f"libquantized_ops_aot_lib.{so_ext}", + ] + + res = subprocess.run(find_lib_cmd, capture_output=True) + if res.returncode == 0: + library_path = res.stdout.decode().strip() + torch.ops.load_library(library_path) + else: + raise RuntimeError( + f"Failed to load libquantized_ops_aot_lib.{so_ext}. Did you build it?" + ) diff --git a/backends/arm/test/misc/test_debug_feats.py b/backends/arm/test/misc/test_debug_feats.py index 4cac39af70..3343ae748c 100644 --- a/backends/arm/test/misc/test_debug_feats.py +++ b/backends/arm/test/misc/test_debug_feats.py @@ -80,7 +80,9 @@ def _is_tosa_marker_in_file(self, tmp_file): def test_MI_artifact(self): model = Linear(20, 30) - tmp_file = os.path.join(tempfile.mkdtemp(), "tosa_dump_MI.txt") + tmp_file = common.get_time_formatted_path( + tempfile.mkdtemp(), self._testMethodName + ) self._tosa_MI_pipeline(model, dump_file=tmp_file) assert os.path.exists(tmp_file), f"File {tmp_file} was not created" if self._is_tosa_marker_in_file(tmp_file): @@ -89,7 +91,9 @@ def test_MI_artifact(self): def test_BI_artifact(self): model = Linear(20, 30) - tmp_file = os.path.join(tempfile.mkdtemp(), "tosa_dump_BI.txt") + tmp_file = common.get_time_formatted_path( + tempfile.mkdtemp(), self._testMethodName + ) self._tosa_BI_pipeline(model, dump_file=tmp_file) assert os.path.exists(tmp_file), f"File {tmp_file} was not created" if self._is_tosa_marker_in_file(tmp_file): diff --git a/backends/arm/test/models/test_mobilenet_v2_arm.py b/backends/arm/test/models/test_mobilenet_v2_arm.py index 19b4254575..24af9cf41a 100644 --- a/backends/arm/test/models/test_mobilenet_v2_arm.py +++ b/backends/arm/test/models/test_mobilenet_v2_arm.py @@ -9,7 +9,7 @@ import unittest import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir import EdgeCompileConfig @@ -96,7 +96,7 @@ def test_mv2_u55_BI(self): .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs( atol=1.0, qtol=1, inputs=self.model_inputs, target_board="corstone-300" ) @@ -114,7 +114,7 @@ def test_mv2_u85_BI(self): .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs( atol=1.0, qtol=1, inputs=self.model_inputs, target_board="corstone-320" ) diff --git a/backends/arm/test/ops/test_add.py b/backends/arm/test/ops/test_add.py index 6676a38add..f40037f62f 100644 --- a/backends/arm/test/ops/test_add.py +++ b/backends/arm/test/ops/test_add.py @@ -10,7 +10,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir import EdgeCompileConfig from executorch.exir.backend.compile_spec_schema import CompileSpec @@ -115,7 +115,7 @@ def _test_add_ethos_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) return tester diff --git a/backends/arm/test/ops/test_avg_pool.py 
b/backends/arm/test/ops/test_avg_pool.py index ad3ddf8c0a..4801849949 100644 --- a/backends/arm/test/ops/test_avg_pool.py +++ b/backends/arm/test/ops/test_avg_pool.py @@ -14,7 +14,7 @@ ArmQuantizer, get_symmetric_quantization_config, ) -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize from executorch.exir.backend.backend_details import CompileSpec @@ -118,7 +118,7 @@ def _test_avgpool2d_tosa_ethos_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py index 523a90cdc8..0952d2595f 100644 --- a/backends/arm/test/ops/test_bmm.py +++ b/backends/arm/test/ops/test_bmm.py @@ -9,7 +9,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -112,7 +112,7 @@ def _test_bmm_ethosu_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(inputs=test_data, qtol=1) @parameterized.expand(BMM.test_parameters) @@ -161,7 +161,7 @@ def test_bmm_u85_BI(self, operand1: torch.Tensor, operand2: torch.Tensor): ) @parameterized.expand(BMM.test_parameters[1:]) - @common.expectedFailureOnFVP + @conftest.expectedFailureOnFVP def test_bmm_u85_BI_xfails(self, operand1: torch.Tensor, operand2: torch.Tensor): test_data = (operand1, operand2) self._test_bmm_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_cat.py b/backends/arm/test/ops/test_cat.py index 88846369d0..bf436a8c18 100644 --- a/backends/arm/test/ops/test_cat.py +++ b/backends/arm/test/ops/test_cat.py @@ -10,8 +10,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common - +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -114,7 +113,7 @@ def _test_cat_ethosu_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(inputs=test_data) @parameterized.expand(Cat.test_parameters) @@ -135,7 +134,7 @@ def test_cat_tosa_BI(self, operands: tuple[torch.Tensor, ...], dim: int): # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Cat.test_parameters) - @common.expectedFailureOnFVP + @conftest.expectedFailureOnFVP def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): test_data = (operands, dim) self._test_cat_ethosu_BI_pipeline( @@ -144,7 +143,7 @@ def test_cat_u55_BI(self, operands: tuple[torch.Tensor, ...], dim: int): # Mismatch in provided number of inputs and model signature, MLETORCH 519 @parameterized.expand(Cat.test_parameters) - @common.expectedFailureOnFVP + @conftest.expectedFailureOnFVP def 
test_cat_u85_BI(self, operands: tuple[torch.Tensor, ...], dim: int): test_data = (operands, dim) self._test_cat_ethosu_BI_pipeline( diff --git a/backends/arm/test/ops/test_clone.py b/backends/arm/test/ops/test_clone.py index 6b5216a8e1..2e7726a0bc 100644 --- a/backends/arm/test/ops/test_clone.py +++ b/backends/arm/test/ops/test_clone.py @@ -17,7 +17,7 @@ ArmQuantizer, get_symmetric_quantization_config, ) -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.backends.xnnpack.test.tester.tester import Quantize @@ -96,7 +96,7 @@ def _test_clone_tosa_ethos_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) def _test_clone_tosa_u55_pipeline( diff --git a/backends/arm/test/ops/test_conv1d.py b/backends/arm/test/ops/test_conv1d.py index f00c7984a1..e6e027ed6e 100644 --- a/backends/arm/test/ops/test_conv1d.py +++ b/backends/arm/test/ops/test_conv1d.py @@ -9,8 +9,7 @@ from typing import List, Optional, Tuple, Union import torch -from executorch.backends.arm.test import common - +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -279,7 +278,7 @@ def _test_conv1d_ethosu_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite) diff --git a/backends/arm/test/ops/test_conv2d.py b/backends/arm/test/ops/test_conv2d.py index 21df4bf0d5..222945cd16 100644 --- a/backends/arm/test/ops/test_conv2d.py +++ b/backends/arm/test/ops/test_conv2d.py @@ -9,8 +9,7 @@ from typing import List, Optional, Tuple, Union import torch -from executorch.backends.arm.test import common - +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.compile_spec_schema import CompileSpec from parameterized import parameterized @@ -310,7 +309,7 @@ def _test_conv2d_ethosu_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite) diff --git a/backends/arm/test/ops/test_conv_combos.py b/backends/arm/test/ops/test_conv_combos.py index 001c4a2bd5..86bf9cb632 100644 --- a/backends/arm/test/ops/test_conv_combos.py +++ b/backends/arm/test/ops/test_conv_combos.py @@ -12,7 +12,7 @@ import pytest import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -253,7 +253,7 @@ def _test_conv_combo_ethos_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) #################### diff --git a/backends/arm/test/ops/test_depthwise_conv.py 
b/backends/arm/test/ops/test_depthwise_conv.py index d753245f43..083e9aaf68 100644 --- a/backends/arm/test/ops/test_depthwise_conv.py +++ b/backends/arm/test/ops/test_depthwise_conv.py @@ -9,7 +9,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.ops.test_conv1d import Conv1d from executorch.backends.arm.test.ops.test_conv2d import Conv2d @@ -243,7 +243,7 @@ def _test_dw_conv_ethos_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(testsuite_conv1d + testsuite_conv2d) @@ -301,7 +301,7 @@ def test_dw_conv_u85_BI( # All test cases except 3x3_1x3x256x256_gp3_st1 have numerical issues on FVP. MLETORCH-520 @parameterized.expand(testsuite_conv2d_u85_xfails) - @common.expectedFailureOnFVP + @conftest.expectedFailureOnFVP def test_dw_conv_u85_BI_xfails( self, test_name: str, model: torch.nn.Module, set_quantize_io: bool = False ): diff --git a/backends/arm/test/ops/test_div.py b/backends/arm/test/ops/test_div.py index 27febd714e..eaf6a21023 100644 --- a/backends/arm/test/ops/test_div.py +++ b/backends/arm/test/ops/test_div.py @@ -11,7 +11,7 @@ from typing import Optional, Tuple, Union import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from parameterized import parameterized @@ -157,7 +157,7 @@ def _test_div_ethos_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) @@ -198,7 +198,7 @@ def test_div_u55_BI( # Numerical issues on FVP likely due to mul op, MLETORCH-521 @parameterized.expand(test_data_suite[2:]) - @common.expectedFailureOnFVP + @conftest.expectedFailureOnFVP def test_div_u55_BI_xfails( self, test_name: str, @@ -226,7 +226,7 @@ def test_div_u85_BI( # Numerical issues on FVP likely due to mul op, MLETORCH-521 @parameterized.expand(test_data_suite[2:]) - @common.expectedFailureOnFVP + @conftest.expectedFailureOnFVP def test_div_u85_BI_xfails( self, test_name: str, diff --git a/backends/arm/test/ops/test_exp.py b/backends/arm/test/ops/test_exp.py index f33e0a9058..57cd23bb14 100644 --- a/backends/arm/test/ops/test_exp.py +++ b/backends/arm/test/ops/test_exp.py @@ -10,7 +10,7 @@ from typing import Tuple import torch -from executorch.backends.arm.test import common +from executorch.backends.arm.test import common, conftest from executorch.backends.arm.test.tester.arm_tester import ArmTester from executorch.exir.backend.backend_details import CompileSpec from parameterized import parameterized @@ -95,7 +95,7 @@ def _test_exp_ethosu_BI_pipeline( .to_executorch() .serialize() ) - if common.is_option_enabled("corstone300"): + if conftest.is_option_enabled("corstone_fvp"): tester.run_method_and_compare_outputs(qtol=1, inputs=test_data) @parameterized.expand(test_data_suite) diff --git a/backends/arm/test/ops/test_expand.py b/backends/arm/test/ops/test_expand.py index 27f311b546..05f72aa379 100644 --- a/backends/arm/test/ops/test_expand.py +++ b/backends/arm/test/ops/test_expand.py @@ -17,7 +17,7 @@ ArmQuantizer, get_symmetric_quantization_config, ) -from 
executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.backends.xnnpack.test.tester.tester import Quantize
@@ -97,7 +97,7 @@ def _test_expand_ethosu_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(Expand.test_parameters)
@@ -110,7 +110,7 @@ def test_expand_tosa_BI(self, test_input, multiples):
 
     # Mismatch in provided number of inputs and model signature, MLETORCH 519
     @parameterized.expand(Expand.test_parameters)
-    @common.expectedFailureOnFVP
+    @conftest.expectedFailureOnFVP
     def test_expand_u55_BI(self, test_input, multiples):
         self._test_expand_ethosu_BI_pipeline(
             common.get_u55_compile_spec(), self.Expand(), (test_input, multiples)
@@ -118,7 +118,7 @@ def test_expand_u55_BI(self, test_input, multiples):
 
     # Mismatch in provided number of inputs and model signature, MLETORCH 519
     @parameterized.expand(Expand.test_parameters)
-    @common.expectedFailureOnFVP
+    @conftest.expectedFailureOnFVP
     def test_expand_u85_BI(self, test_input, multiples):
         self._test_expand_ethosu_BI_pipeline(
             common.get_u85_compile_spec(), self.Expand(), (test_input, multiples)
diff --git a/backends/arm/test/ops/test_full.py b/backends/arm/test/ops/test_full.py
index 9857a7b87b..2ee41f8bc1 100644
--- a/backends/arm/test/ops/test_full.py
+++ b/backends/arm/test/ops/test_full.py
@@ -13,7 +13,7 @@ from typing import Tuple
 
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from parameterized import parameterized
@@ -109,7 +109,7 @@ def _test_full_tosa_ethos_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     def _test_full_tosa_u55_pipeline(self, module: torch.nn.Module, test_data: Tuple):
@@ -145,7 +145,7 @@ def test_full_tosa_BI(self, test_tensor: Tuple):
 
     # Mismatch in provided number of inputs and model signature, MLETORCH 519
     @parameterized.expand(AddVariableFull.test_parameters)
-    @common.expectedFailureOnFVP
+    @conftest.expectedFailureOnFVP
     def test_full_u55_BI(self, test_tensor: Tuple):
         self._test_full_tosa_u55_pipeline(
             self.AddVariableFull(),
@@ -154,7 +154,7 @@ def test_full_u55_BI(self, test_tensor: Tuple):
 
     # Mismatch in provided number of inputs and model signature, MLETORCH 519
     @parameterized.expand(AddVariableFull.test_parameters)
-    @common.expectedFailureOnFVP
+    @conftest.expectedFailureOnFVP
     def test_full_u85_BI(self, test_tensor: Tuple):
         self._test_full_tosa_u85_pipeline(
             self.AddVariableFull(),
diff --git a/backends/arm/test/ops/test_hardtanh.py b/backends/arm/test/ops/test_hardtanh.py
index 10073c5095..1c763e8167 100644
--- a/backends/arm/test/ops/test_hardtanh.py
+++ b/backends/arm/test/ops/test_hardtanh.py
@@ -15,7 +15,7 @@
     get_symmetric_quantization_config,
 )
 
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from parameterized import parameterized
@@ -108,7 +108,7 @@ def _test_hardtanh_tosa_ethosu_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_suite)
diff --git a/backends/arm/test/ops/test_layer_norm.py b/backends/arm/test/ops/test_layer_norm.py
index e84dd4ee58..a4d3bc5adf 100644
--- a/backends/arm/test/ops/test_layer_norm.py
+++ b/backends/arm/test/ops/test_layer_norm.py
@@ -8,7 +8,7 @@ from typing import List, Tuple, Union
 
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.exir.backend.backend_details import CompileSpec
 from parameterized import parameterized
@@ -130,7 +130,7 @@ def _test_layernorm_ethosu_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_suite)
diff --git a/backends/arm/test/ops/test_linear.py b/backends/arm/test/ops/test_linear.py
index 30d4b2890a..8aabd365af 100644
--- a/backends/arm/test/ops/test_linear.py
+++ b/backends/arm/test/ops/test_linear.py
@@ -11,7 +11,7 @@ from typing import Tuple
 
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 
 from executorch.exir import EdgeCompileConfig
@@ -247,7 +247,7 @@ def test_linear_tosa_u55_BI(
             test_data,
         )
 
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_suite_rank1 + test_data_suite_rank4)
diff --git a/backends/arm/test/ops/test_log.py b/backends/arm/test/ops/test_log.py
index 10175d27fb..4dd1fc97c7 100644
--- a/backends/arm/test/ops/test_log.py
+++ b/backends/arm/test/ops/test_log.py
@@ -10,7 +10,7 @@ from typing import Tuple
 
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.exir.backend.backend_details import CompileSpec
 from parameterized import parameterized
@@ -95,7 +95,7 @@ def _test_log_ethosu_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_suite)
diff --git a/backends/arm/test/ops/test_max_pool.py b/backends/arm/test/ops/test_max_pool.py
index 41526b1c77..3a12616df6 100644
--- a/backends/arm/test/ops/test_max_pool.py
+++ b/backends/arm/test/ops/test_max_pool.py
@@ -15,7 +15,7 @@
     ArmQuantizer,
     get_symmetric_quantization_config,
 )
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 
@@ -171,7 +171,7 @@ def test_maxpool2d_tosa_u55_BI(
             common.get_u55_compile_spec(permute_memory_to_nhwc=True),
             (test_data,),
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(
                 qtol=1, inputs=(test_data,), target_board="corstone-300"
             )
@@ -188,7 +188,7 @@ def test_maxpool2d_tosa_u85_BI(
             common.get_u85_compile_spec(permute_memory_to_nhwc=True),
             (test_data,),
        )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(
                 qtol=1, inputs=(test_data,), target_board="corstone-320"
             )
@@ -216,7 +216,7 @@ def test_maxpool2d_tosa_BI_mult_batches(
         )
 
     @parameterized.expand(test_data_suite_mult_batches)
-    @common.expectedFailureOnFVP  # TODO: MLETORCH-433
+    @conftest.expectedFailureOnFVP  # TODO: MLETORCH-433
     def test_maxpool2d_tosa_u55_BI_mult_batches(
         self,
         test_name: str,
@@ -228,13 +228,13 @@ def test_maxpool2d_tosa_u55_BI_mult_batches(
             common.get_u55_compile_spec(permute_memory_to_nhwc=True),
             (test_data,),
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(
                 qtol=1, inputs=(test_data,), target_board="corstone-300"
             )
 
     @parameterized.expand(test_data_suite_mult_batches)
-    @common.expectedFailureOnFVP  # TODO: MLETORCH-433
+    @conftest.expectedFailureOnFVP  # TODO: MLETORCH-433
     def test_maxpool2d_tosa_u85_BI_mult_batches(
         self,
         test_name: str,
@@ -246,7 +246,7 @@ def test_maxpool2d_tosa_u85_BI_mult_batches(
             common.get_u85_compile_spec(permute_memory_to_nhwc=True),
             (test_data,),
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(
                 qtol=1, inputs=(test_data,), target_board="corstone-320"
             )
diff --git a/backends/arm/test/ops/test_mul.py b/backends/arm/test/ops/test_mul.py
index 6d6922628e..ced71b0072 100644
--- a/backends/arm/test/ops/test_mul.py
+++ b/backends/arm/test/ops/test_mul.py
@@ -8,7 +8,7 @@
 import unittest
 
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.exir.backend.backend_details import CompileSpec
 from parameterized import parameterized
@@ -128,7 +128,7 @@ def _test_mul_ethosu_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_sute)
diff --git a/backends/arm/test/ops/test_permute.py b/backends/arm/test/ops/test_permute.py
index 92400215b7..581cd3cfbc 100644
--- a/backends/arm/test/ops/test_permute.py
+++ b/backends/arm/test/ops/test_permute.py
@@ -15,7 +15,7 @@
     get_symmetric_quantization_config,
 )
 
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.backends.xnnpack.test.tester.tester import Quantize
 from executorch.exir.backend.compile_spec_schema import CompileSpec
@@ -117,7 +117,7 @@ def _test_permute_ethos_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_suite)
@@ -155,7 +155,7 @@ def test_permute_u85_BI(
 
     # Fails since on FVP since N > 1 is not supported. MLETORCH-517
     @parameterized.expand(test_data_suite[-2:])
-    @common.expectedFailureOnFVP
+    @conftest.expectedFailureOnFVP
     def test_permute_u85_BI_xfails(
         self, test_name: str, test_data: torch.Tensor, dims: list[int]
     ):
diff --git a/backends/arm/test/ops/test_reciprocal.py b/backends/arm/test/ops/test_reciprocal.py
index 876f063c76..a71396caf3 100644
--- a/backends/arm/test/ops/test_reciprocal.py
+++ b/backends/arm/test/ops/test_reciprocal.py
@@ -7,7 +7,7 @@
 import unittest
 
 import torch
-from executorch.backends.arm.test import common
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from parameterized import parameterized
@@ -97,7 +97,7 @@ def _test_reciprocal_u55_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(test_data_suite)
diff --git a/backends/arm/test/ops/test_sub.py b/backends/arm/test/ops/test_sub.py
index 327a8de994..0592141028 100644
--- a/backends/arm/test/ops/test_sub.py
+++ b/backends/arm/test/ops/test_sub.py
@@ -10,8 +10,7 @@ from typing import Tuple
 
 import torch
-from executorch.backends.arm.test import common
-
+from executorch.backends.arm.test import common, conftest
 from executorch.backends.arm.test.tester.arm_tester import ArmTester
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from parameterized import parameterized
@@ -98,7 +97,7 @@ def _test_sub_ethosu_BI_pipeline(
             .to_executorch()
             .serialize()
         )
-        if common.is_option_enabled("corstone300"):
+        if conftest.is_option_enabled("corstone_fvp"):
             tester.run_method_and_compare_outputs(qtol=1, inputs=test_data)
 
     @parameterized.expand(Sub.test_parameters)
diff --git a/backends/arm/test/runner_utils.py b/backends/arm/test/runner_utils.py
index b61c1b465f..a8a113cf93 100644
--- a/backends/arm/test/runner_utils.py
+++ b/backends/arm/test/runner_utils.py
@@ -17,7 +17,7 @@
 import numpy as np
 import torch
 
-from executorch.backends.arm.test.common import arm_test_options, is_option_enabled
+from executorch.backends.arm.test.conftest import arm_test_options, is_option_enabled
 
 from torch.export import ExportedProgram
 from torch.fx.node import Node
@@ -218,7 +218,7 @@ def run_corstone(
         assert (
             self._has_init_run
-        ), "RunnerUtil needs to be initialized using init_run() before running Corstone300."
+        ), "RunnerUtil needs to be initialized using init_run() before running Corstone FVP."
 
         if self.target_board not in ["corstone-300", "corstone-320"]:
             raise RuntimeError(f"Unknown target board: {self.target_board}")
diff --git a/backends/arm/test/tester/arm_tester.py b/backends/arm/test/tester/arm_tester.py
index 3564a3325a..6784605bb4 100644
--- a/backends/arm/test/tester/arm_tester.py
+++ b/backends/arm/test/tester/arm_tester.py
@@ -22,12 +22,7 @@
     ArmQuantizer,
     get_symmetric_quantization_config,
 )
-from executorch.backends.arm.test.common import (
-    arm_test_options,
-    current_time_formated,
-    get_option,
-    get_target_board,
-)
+from executorch.backends.arm.test.common import get_target_board
 
 from executorch.backends.arm.test.runner_utils import (
     _get_input_quantization_params,
@@ -626,9 +621,6 @@ def _get_tosa_operator_distribution(
 
 
 def _dump_str(to_print: str, path_to_dump: Optional[str] = None):
-    default_dump_path = get_option(arm_test_options.dump_path)
-    if not path_to_dump and default_dump_path:
-        path_to_dump = default_dump_path / f"ArmTester_{current_time_formated()}.log"
     if path_to_dump:
         with open(path_to_dump, "a") as fp:
             fp.write(to_print)
diff --git a/examples/arm/README.md b/examples/arm/README.md
index 717a96c13e..bb68ef537b 100644
--- a/examples/arm/README.md
+++ b/examples/arm/README.md
@@ -24,7 +24,7 @@ To run these scripts. On a Linux system, in a terminal, with a working internet
 $ ./setup.sh --i-agree-to-the-contained-eula [optional-scratch-dir]
 
 # Step [2] - build + run ExecuTorch and executor_runner baremetal application
-# suited for Corstone300 to run a simple PyTorch model.
+# suited for Corstone FVPs to run a simple PyTorch model.
 $ ./run.sh [--scratch-dir=same-optional-scratch-dir-as-before]
 ```
 ### Online Tutorial

From 2d499b3d0cf0b085e373404e5ff421a73a4a22b4 Mon Sep 17 00:00:00 2001
From: Tarun Karuturi <58826100+tarun292@users.noreply.github.com>
Date: Mon, 2 Dec 2024 01:55:31 -0800
Subject: [PATCH 27/27] Add quantize_per_channel and dequantize_per_channel to
 q_dq_ops target

Differential Revision: D66400800

Pull Request resolved: https://github.com/pytorch/executorch/pull/7045
---
 kernels/quantized/targets.bzl | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernels/quantized/targets.bzl b/kernels/quantized/targets.bzl
index 13ef166ece..5440400612 100644
--- a/kernels/quantized/targets.bzl
+++ b/kernels/quantized/targets.bzl
@@ -69,6 +69,8 @@ def define_common_targets():
             "quantized_decomposed::dequantize_per_tensor.Tensor_out",
             "quantized_decomposed::quantize_per_tensor.out",
             "quantized_decomposed::quantize_per_tensor.Tensor_out",
+            "quantized_decomposed::dequantize_per_channel.out",
+            "quantized_decomposed::quantize_per_channel.out",
         ],
     )
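
For readers following the Corstone rename above: `conftest.is_option_enabled` gates FVP execution behind an opt-in test option, so the Corstone tests are skipped unless the runner explicitly enables them. A minimal sketch of that gating pattern, assuming a typical pytest `conftest.py`; the option dictionary, flag name, and fallback behavior here are illustrative and not the exact ExecuTorch implementation:

```python
# Sketch of the option plumbing the tests above rely on (assumed layout).
from typing import Dict

# In a real conftest.py this would be populated from pytest command-line
# flags (e.g. a hypothetical --arm_run_corstone_fvp switch).
_enabled_options: Dict[str, bool] = {"corstone_fvp": False}


def is_option_enabled(option: str, fail_if_not_enabled: bool = False) -> bool:
    """Return True if a named test option such as 'corstone_fvp' is enabled."""
    if _enabled_options.get(option, False):
        return True
    if fail_if_not_enabled:
        raise RuntimeError(f"Test option '{option}' is required but not enabled.")
    return False


# Usage mirroring the tests above: only run on the FVP when opted in.
if is_option_enabled("corstone_fvp"):
    print("would run the serialized model on the Corstone FVP and compare outputs")
```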
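The two ops registered in [PATCH 27/27] are the per-channel counterparts of the per-tensor quantize/dequantize entries already in the list: per-channel quantization keeps one (scale, zero_point) pair per slice along a chosen axis rather than a single pair for the whole tensor. A rough reference of what the dequantize side computes, in plain PyTorch; `dequantize_per_channel_ref` is an illustrative helper, not the registered out-variant kernel:

```python
import torch


def dequantize_per_channel_ref(
    qx: torch.Tensor,
    scales: torch.Tensor,
    zero_points: torch.Tensor,
    axis: int,
) -> torch.Tensor:
    # Reshape scales/zero_points so they broadcast along the chosen axis;
    # this per-axis pairing is what distinguishes per-channel from
    # per-tensor quantization.
    shape = [1] * qx.dim()
    shape[axis] = qx.size(axis)
    scales = scales.reshape(shape).to(torch.float32)
    zero_points = zero_points.reshape(shape).to(torch.float32)
    return (qx.to(torch.float32) - zero_points) * scales


# Example: int8 weights with one (scale, zero_point) pair per output channel.
qw = torch.randint(-128, 128, (4, 8), dtype=torch.int8)
w = dequantize_per_channel_ref(
    qw,
    scales=torch.rand(4) * 0.1,
    zero_points=torch.zeros(4, dtype=torch.int32),
    axis=0,
)
print(w.shape)  # torch.Size([4, 8])
```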