From 7af39c6955a48ea4b7e1f8f7f354377e75fe6f44 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 23 Jul 2024 11:03:55 -0700 Subject: [PATCH 01/57] Update nodejs's cmake file to fix a file copy issue (#21390) This commit e5f18ba2c14ced91e5f483fde0a7ef4b3b04abbe caused some nightly pipelines to fail. This PR fixes it. It is because recently I changed our Linux library's SONAME. At runtime onnxruntime_binding depends on libonnxruntime.so.1 , instead of libonnxruntime.so.1.19.0(with the full version number). Therefore we need to keep the libonnxruntime.so.1 symlink. The packaging tools/ci_build/github/js/pack-npm-packages.ps1 still needs be updated. I will address it in another PR. --- js/node/CMakeLists.txt | 62 ++++++++++++++---------------------------- 1 file changed, 20 insertions(+), 42 deletions(-) diff --git a/js/node/CMakeLists.txt b/js/node/CMakeLists.txt index 5c32f62f3a802..1ce6d66881c3e 100644 --- a/js/node/CMakeLists.txt +++ b/js/node/CMakeLists.txt @@ -59,8 +59,18 @@ endif() file(GLOB ORT_NODEJS_BINDING_SOURCE_FILES ${CMAKE_SOURCE_DIR}/src/*.cc) add_library(onnxruntime_binding SHARED ${ORT_NODEJS_BINDING_SOURCE_FILES} ${CMAKE_JS_SRC}) +file(MAKE_DIRECTORY ${dist_folder}) + set_target_properties(onnxruntime_binding PROPERTIES PREFIX "" SUFFIX ".node" + RUNTIME_OUTPUT_DIRECTORY ${dist_folder} + RUNTIME_OUTPUT_DIRECTORY_DEBUG ${dist_folder} + RUNTIME_OUTPUT_DIRECTORY_RELEASE ${dist_folder} + RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${dist_folder} + LIBRARY_OUTPUT_DIRECTORY ${dist_folder} + LIBRARY_OUTPUT_DIRECTORY_DEBUG ${dist_folder} + LIBRARY_OUTPUT_DIRECTORY_RELEASE ${dist_folder} + LIBRARY_OUTPUT_DIRECTORY_RELWITHDEBINFO ${dist_folder} BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH FALSE) target_link_libraries(onnxruntime_binding PRIVATE ${CMAKE_JS_LIB}) @@ -86,61 +96,29 @@ else() endif() if (WIN32) - target_link_libraries(onnxruntime_binding PRIVATE onnxruntime.lib) + target_link_libraries(onnxruntime_binding PRIVATE onnxruntime) elseif (APPLE) target_link_libraries(onnxruntime_binding PRIVATE libonnxruntime.${ort_version}.dylib) set_target_properties(onnxruntime_binding PROPERTIES INSTALL_RPATH "@loader_path") else() - target_link_libraries(onnxruntime_binding PRIVATE libonnxruntime.so.${ort_version}) + target_link_libraries(onnxruntime_binding PRIVATE onnxruntime) set_target_properties(onnxruntime_binding PROPERTIES INSTALL_RPATH "$ORIGIN/") endif() -# post build - -add_custom_command( - TARGET onnxruntime_binding POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${dist_folder} - COMMAND ${CMAKE_COMMAND} -E copy $ ${dist_folder} -) if (WIN32) - add_custom_command( - TARGET onnxruntime_binding POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll - ${dist_folder} - ) + file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll + DESTINATION ${dist_folder}) if (USE_DML) - add_custom_command( - TARGET onnxruntime_binding POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${ONNXRUNTIME_WIN_BIN_DIR}/DirectML.dll - ${dist_folder} - ) + file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/DirectML.dll + DESTINATION ${dist_folder}) endif () - if (CMAKE_BUILD_TYPE STREQUAL "Debug") - add_custom_command( - TARGET onnxruntime_binding POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.pdb - ${dist_folder} - COMMAND ${CMAKE_COMMAND} -E copy $/onnxruntime_binding.pdb ${dist_folder} - ) - endif() elseif (APPLE) - add_custom_command( - TARGET onnxruntime_binding POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - 
${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.${ort_version}.dylib - ${dist_folder} - ) + file(COPY ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.dylib + DESTINATION ${dist_folder} FOLLOW_SYMLINK_CHAIN) elseif (UNIX) - add_custom_command( - TARGET onnxruntime_binding POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy - ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.so.${ort_version} - ${dist_folder} - ) + file(COPY ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.so + DESTINATION ${dist_folder} FOLLOW_SYMLINK_CHAIN) else() message(FATAL_ERROR "Platform not supported.") endif() From 2b7e2a5bd07a882a1a1f16e81025a74745ef0394 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 23 Jul 2024 11:58:04 -0700 Subject: [PATCH 02/57] [CUDA] Fix cuda provider fallback inconsistency (#21425) * Fix fallback setting (cuda still falls back to cuda). * Fix cuda provider fallback inconsistent with/without CUDA_PATH environment variable. * Add cuda and cudnn major version requirement in error message. Example result in Windows: ``` >>> import onnxruntime >>> ort_session = onnxruntime.InferenceSession("model.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) 2024-07-19 17:43:44.2260019 [E:onnxruntime:Default, provider_bridge_ort.cc:1972 onnxruntime::TryGetProviderInfo_CUDA] D:\onnxruntime\onnxruntime\core\session\provider_bridge_ort.cc:1636 onnxruntime::ProviderLibrary::Get [ONNXRuntimeError] : 1 : FAIL : LoadLibrary failed with error 126 "" when trying to load "C:\Users\.conda\envs\py310\lib\site-packages\onnxruntime\capi\onnxruntime_providers_cuda.dll" 2024-07-19 17:43:44.2312351 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:970 onnxruntime::python::CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Require cuDNN 9.* and CUDA 12.*, and the latest MSVC runtime. Please install all dependencies as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported. >>> ort_session >>> ort_session.get_providers() ['CPUExecutionProvider'] ``` Example result in Linux: ``` >>> import onnxruntime >>> ort_session = onnxruntime.InferenceSession("resnet50-v2-7.onnx", providers=['CUDAExecutionProvider', 'CPUExecutionProvider']) 2024-07-20 20:33:26.486974543 [E:onnxruntime:Default, provider_bridge_ort.cc:1972 TryGetProviderInfo_CUDA] /work/onnxruntime/onnxruntime/core/session/provider_bridge_ort.cc:1636 onnxruntime::Provider& onnxruntime::ProviderLibrary::Get() [ONNXRuntimeError] : 1 : FAIL : Failed to load library libonnxruntime_providers_cuda.so with error: libcublasLt.so.12: cannot open shared object file: No such file or directory 2024-07-20 20:33:26.487034646 [W:onnxruntime:Default, onnxruntime_pybind_state.cc:961 CreateExecutionProviderInstance] Failed to create CUDAExecutionProvider. Require cuDNN 9.* and CUDA 12.*. Please install all dependencies as mentioned in the GPU requirements page (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), make sure they're in the PATH, and that your GPU is supported. 
>>> ort_session.get_providers() ['CPUExecutionProvider'] ``` ### Motivation and Context https://github.com/microsoft/onnxruntime/issues/21424 --- cmake/onnxruntime_python.cmake | 8 +++++-- .../onnxruntime_inference_collection.py | 16 +++++++++---- .../python/onnxruntime_pybind_state.cc | 24 ++++++++++--------- 3 files changed, 31 insertions(+), 17 deletions(-) diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 07c65e7986b05..270139ceaff7b 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -97,8 +97,12 @@ endif() onnxruntime_add_include_to_target(onnxruntime_pybind11_state Python::Module Python::NumPy) target_include_directories(onnxruntime_pybind11_state PRIVATE ${ONNXRUNTIME_ROOT} ${pybind11_INCLUDE_DIRS}) -if(onnxruntime_USE_CUDA AND onnxruntime_CUDNN_HOME) - target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CUDNN_HOME}/include) +if(onnxruntime_USE_CUDA) + target_include_directories(onnxruntime_pybind11_state PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + # cudnn_home is optional for Window when cuda and cudnn are installed in the same directory. + if(onnxruntime_CUDNN_HOME) + target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CUDNN_HOME}/include) + endif() endif() if(onnxruntime_USE_CANN) target_include_directories(onnxruntime_pybind11_state PRIVATE ${onnxruntime_CANN_HOME}/include) diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index ecae280e92ae5..c3cfe2c97ae95 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -438,10 +438,18 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi # Tensorrt can fall back to CUDA if it's explicitly assigned. All others fall back to CPU. if "TensorrtExecutionProvider" in available_providers: - if providers and any( - provider == "CUDAExecutionProvider" - or (isinstance(provider, tuple) and provider[0] == "CUDAExecutionProvider") - for provider in providers + if ( + providers + and any( + provider == "CUDAExecutionProvider" + or (isinstance(provider, tuple) and provider[0] == "CUDAExecutionProvider") + for provider in providers + ) + and any( + provider == "TensorrtExecutionProvider" + or (isinstance(provider, tuple) and provider[0] == "TensorrtExecutionProvider") + for provider in providers + ) ): self._fallback_providers = ["CUDAExecutionProvider", "CPUExecutionProvider"] else: diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index d7155b2b6899a..6b5daf8cb882b 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -35,6 +35,11 @@ #include "contrib_ops/cpu/aten_ops/aten_op_executor.h" #endif +#ifdef USE_CUDA +#include // for CUDA_VERSION +#include // for CUDNN_MAJOR +#endif + #include // Explicitly provide a definition for the static const var 'GPU' in the OrtDevice struct, @@ -951,21 +956,18 @@ std::unique_ptr CreateExecutionProviderInstance( // external CUDA allocator. external_allocator_info = info.external_allocator_info; return cuda_provider_info->CreateExecutionProviderFactory(info)->CreateProvider(); - } else { - if (!Env::Default().GetEnvironmentVar("CUDA_PATH").empty()) { - ORT_THROW( - "CUDA_PATH is set but CUDA wasnt able to be loaded. 
Please install the correct version of CUDA and" - "cuDNN as mentioned in the GPU requirements page " - " (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), " - " make sure they're in the PATH, and that your GPU is supported."); - } } } LOGS_DEFAULT(WARNING) << "Failed to create " << type - << ". Please reference " - << "https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements" - << "to ensure all dependencies are met."; + << ". Require cuDNN " << CUDNN_MAJOR << ".* and " + << "CUDA " << (CUDA_VERSION / 1000) << ".*" +#if defined(_MSC_VER) + << ", and the latest MSVC runtime" +#endif + << ". Please install all dependencies as mentioned in the GPU requirements page" + " (https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements), " + "make sure they're in the PATH, and that your GPU is supported."; #endif } else if (type == kRocmExecutionProvider) { #ifdef USE_ROCM From c65afcea551ad96e9247754d28914bc50c1eefca Mon Sep 17 00:00:00 2001 From: George Wu Date: Tue, 23 Jul 2024 15:54:44 -0700 Subject: [PATCH 03/57] fix python qnn pipelines issues (#21462) build_py_params wasn't plumbed through for python qnn pipelines. incorporate fixes for deprecated numpy version option from https://github.com/microsoft/onnxruntime/pull/21459 --- .../azure-pipelines/templates/py-packaging-stage.yml | 3 ++- .../azure-pipelines/templates/py-win-arm64-qnn.yml | 9 ++------- .../github/azure-pipelines/templates/py-win-x64-qnn.yml | 2 +- 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 27f85dc5c1648..17e64a207be2f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -510,7 +510,7 @@ stages: MACHINE_POOL: 'onnxruntime-qnn-windows-vs-2022-arm64' QNN_SDK: ${{ parameters.qnn_sdk_version }} PYTHON_VERSION: '3.11' - NUMPY_VERSION: '1.26.4' + BUILD_PY_PARAMETERS: ${{ parameters.build_py_parameters }} - ${{ if eq(parameters.enable_windows_x64_qnn, true) }}: - stage: Python_Packaging_Windows_x64_QNN @@ -520,3 +520,4 @@ stages: parameters: MACHINE_POOL: 'Onnxruntime-QNNEP-Windows-2022-CPU' QNN_SDK: ${{ parameters.qnn_sdk_version }} + BUILD_PY_PARAMETERS: ${{ parameters.build_py_parameters }} diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index af239b4384af9..70221976d978f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -13,10 +13,6 @@ parameters: type: string default: '3.11' -- name: NUMPY_VERSION - type: string - default: '1.26.4' - - name: ENV_SETUP_SCRIPT type: string default: '' @@ -70,7 +66,7 @@ jobs: scriptSource: inline script: | import subprocess - subprocess.call(['pip', 'install', '-q', 'setuptools', 'wheel', 'numpy==${{parameters.NUMPY_VERSION}}']) + subprocess.call(['pip', 'install', '-q', 'setuptools', 'wheel']) workingDirectory: '$(Build.BinariesDirectory)' displayName: 'Install python modules' @@ -93,7 +89,6 @@ jobs: --qnn_home $(QnnSDKRootDir) --enable_pybind --parallel --update - --numpy_version ${{ parameters.NUMPY_VERSION }} $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} workingDirectory: 
'$(Build.BinariesDirectory)' @@ -121,7 +116,7 @@ jobs: displayName: 'Build wheel' inputs: scriptPath: '$(Build.SourcesDirectory)\setup.py' - arguments: 'bdist_wheel ${{ parameters.BUILD_PY_PARAMETERS }} $(NightlyBuildOption) --wheel_name_suffix=qnn' + arguments: 'bdist_wheel $(NightlyBuildOption) --wheel_name_suffix=qnn' workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - task: CopyFiles@2 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index 884e6eafee965..1bf5db5ae6d9a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -122,7 +122,7 @@ jobs: displayName: 'Build wheel' inputs: scriptPath: '$(Build.SourcesDirectory)\setup.py' - arguments: 'bdist_wheel ${{ parameters.BUILD_PY_PARAMETERS }} $(NightlyBuildOption) --wheel_name_suffix=qnn' + arguments: 'bdist_wheel $(NightlyBuildOption) --wheel_name_suffix=qnn' workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - task: CopyFiles@2 From 86cedc68326a4680aed43a291b03ac15734652ba Mon Sep 17 00:00:00 2001 From: mingyueliuh <131847423+mingyueliuh@users.noreply.github.com> Date: Tue, 23 Jul 2024 19:51:00 -0400 Subject: [PATCH 04/57] [Fix] C++ API SetOutputShape for register custom op. (#21366) ### Description Bug fix for the SetOutputShape method in custom op shape inference. ### Motivation and Context - Bug a : A obvious bug that will cause all dimensions to be 1. https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_inline.h#L2014 integer_dims.push_back(dim.IsInt()); -> integer_dims.push_back(dim.AsInt()); - Bug b : vector out of range error op's input maybe a scalar and shape is empty. 
https://github.com/microsoft/onnxruntime/blob/main/include/onnxruntime/core/session/onnxruntime_cxx_inline.h#L1985 --------- Co-authored-by: mingyue --- include/onnxruntime/core/session/onnxruntime_cxx_inline.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index a732bf169dc7a..aaef111b9f15b 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -1982,7 +1982,9 @@ inline ShapeInferContext::ShapeInferContext(const OrtApi* ort_api, TensorTypeAndShapeInfo type_shape_info(info); auto integer_shape = type_shape_info.GetShape(); std::vector symbolic_shape(integer_shape.size(), {}); - type_shape_info.GetSymbolicDimensions(&symbolic_shape[0], integer_shape.size()); + if (!integer_shape.empty()) { + type_shape_info.GetSymbolicDimensions(&symbolic_shape[0], integer_shape.size()); + } Shape shape; for (size_t ith = 0; ith < integer_shape.size(); ++ith) { if (symbolic_shape[ith] && std::string{symbolic_shape[ith]}.size() > 0) { @@ -2011,7 +2013,7 @@ inline Status ShapeInferContext::SetOutputShape(size_t indice, const Shape& shap for (const auto dim : shape) { if (dim.IsInt()) { - integer_dims.push_back(dim.IsInt()); + integer_dims.push_back(dim.AsInt()); symbolic_dims.push_back(""); } else { if (!dim.AsSym() || std::string{dim.AsSym()}.empty()) { From 1df9aa2f080a66e2d40b176623c3ec6add87b9f8 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 24 Jul 2024 11:04:48 +1000 Subject: [PATCH 05/57] CoreML: Add GridSample ML Program support (#21431) ### Description Add GridSample ML Program support One combination of inputs has diffs between the pytorch generated unit tests data and CoreML. Disabling until needed as investigation may take a while. ### Motivation and Context High priorities models --- .../builders/impl/gridsample_op_builder.cc | 132 ++++++++++++++++++ .../coreml/builders/op_builder_factory.cc | 2 + .../coreml/builders/op_builder_factory.h | 1 + .../providers/cpu/tensor/grid_sample_test.cc | 101 +++++++------- .../cpu/tensor/grid_sample_test_gen.py | 2 +- .../apple/coreml_supported_mlprogram_ops.md | 1 + 6 files changed, 190 insertions(+), 49 deletions(-) create mode 100644 onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc diff --git a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc new file mode 100644 index 0000000000000..bfc665e0ac716 --- /dev/null +++ b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc @@ -0,0 +1,132 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/common.h" +#include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" +#include "core/providers/coreml/builders/model_builder.h" +#include "core/providers/coreml/builders/op_builder_factory.h" +#include "core/providers/coreml/shape_utils.h" +#include "core/providers/shared/utils/utils.h" + +namespace onnxruntime { +namespace coreml { + +namespace { +std::string_view GetMode(const NodeAttrHelper& helper) { + // opset 16 used bilinear, nearest, bicubic + // opset 20+ uses linear, nearest, cubic + // bilinear is what CoreML uses, so prefer that + // bicubic/cubic isn't supported + + const auto& mode = helper.Get("mode", "linear"); + if (mode == "linear") { + return "bilinear"; + } + + return mode; +} +} // namespace + +class GridSampleOpBuilder : public BaseOpBuilder { + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override; + + bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } +}; + +Status GridSampleOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder, + [[maybe_unused]] const Node& node, + [[maybe_unused]] const logging::Logger& logger) const { +#if defined(COREML_ENABLE_MLPROGRAM) + using namespace CoreML::Specification::MILSpec; // NOLINT + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.image_resizing.resample + + const auto input_defs = node.InputDefs(); + const auto output_defs = node.OutputDefs(); + + NodeAttrHelper helper(node); + std::string mode{GetMode(helper)}; // need a std::string for use in AddScalarConstant + std::string padding_mode = helper.Get("padding_mode", "zeros"); + const bool align_corners = helper.Get("align_corners", 0); + const std::string coordinates_mode = "normalized_minus_one_to_one"; + + // adjust to coreml equivalents + if (padding_mode == "zeros") { + padding_mode = "constant"; + } + + auto op = model_builder.CreateOperation(node, "resample"); + AddOperationInput(*op, "x", input_defs[0]->Name()); + AddOperationInput(*op, "coordinates", input_defs[1]->Name()); + AddOperationInput(*op, "sampling_mode", model_builder.AddScalarConstant(op->type(), "sampling_mode", mode)); + AddOperationInput(*op, "padding_mode", model_builder.AddScalarConstant(op->type(), "padding_mode", padding_mode)); + AddOperationInput(*op, "padding_value", model_builder.AddScalarConstant(op->type(), "padding_value", 0.0f)); + AddOperationInput(*op, "coordinates_mode", + model_builder.AddScalarConstant(op->type(), "coordinates_mode", coordinates_mode)); + AddOperationInput(*op, "align_corners", model_builder.AddScalarConstant(op->type(), "align_corners", align_corners)); + + AddOperationOutput(*op, *output_defs[0]); + + model_builder.AddOperation(std::move(op)); +#endif + return Status::OK(); +} + +bool GridSampleOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const { + if (!input_params.create_mlprogram) { + LOGS(logger, VERBOSE) << "GridSample is not supported."; + return false; + } + + const auto& input_defs = node.InputDefs(); + + std::vector input_shape; + if (!GetShape(*input_defs[0], input_shape, logger)) { + LOGS(logger, VERBOSE) << 
"GridSample: failed to get input shape"; + return false; + } + + const auto input_rank = input_shape.size(); + if (input_rank != 4) { + LOGS(logger, VERBOSE) << "GridSample only supports 4D input. Got:" << input_rank << "D"; + return false; + } + + NodeAttrHelper helper(node); + std::string_view mode = GetMode(helper); + + if (mode != "bilinear" && mode != "zeros") { + LOGS(logger, VERBOSE) << "GridSample does not support mode of " << mode; + return false; + } + + // there is one combination of settings where the unit test fails. + // The ORT unit test values are generated by pytorch so not clear if it's an issue with CoreML. + // CoreML output is consistent for CPU and non-CPU at least. + // Disabling until there's a use-case that requires this combination. + const auto& padding_mode = helper.Get("padding_mode", "zeros"); + const bool align_corners = helper.Get("align_corners", 0); + + if (mode == "bilinear" && padding_mode == "reflection" && align_corners == false) { + LOGS(logger, VERBOSE) << "GridSample does not support mode:" << mode << " padding_mode:" << padding_mode + << " align_corners:" << align_corners + << " currently due to output diffs that need to be investigated"; + return false; + } + + return true; +} + +void CreateGridSampleOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace coreml +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc index 2c06659852134..b17827f8e0532 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc @@ -130,6 +130,8 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateSplitOpBuilder("Split", op_registrations); } + CreateGridSampleOpBuilder("GridSample", op_registrations); + return op_registrations; } diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h index 6469b4cefa5ea..a9a8ab90b0863 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h @@ -28,6 +28,7 @@ void CreateDepthToSpaceOpBuilder(const std::string& op_type, OpBuilderRegistrati void CreateFlattenOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGatherOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGemmOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateGridSampleOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateLRNOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreatePadOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreatePoolOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc index 5c89d6ea7bd75..540dc6dee68fb 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc @@ -13,6 +13,7 @@ std::vector> 
GetExecutionProviders(int opset std::vector> execution_providers; execution_providers.emplace_back(DefaultCpuExecutionProvider()); + #ifdef USE_CUDA if (opset_version < 20) { execution_providers.emplace_back(DefaultCudaExecutionProvider()); @@ -20,8 +21,12 @@ std::vector> GetExecutionProviders(int opset execution_providers.push_back(DefaultCudaNHWCExecutionProvider()); #endif } +#endif +#if defined(USE_COREML) + execution_providers.push_back(DefaultCoreMLExecutionProvider(/*use_mlprogram*/ true)); #endif + return execution_providers; } @@ -35,7 +40,7 @@ void RunTests(T& test, std::vector>&& execut // DO NOT edit following tests. They are generated by: // onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py -TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_nearest_zeros_align_corners) { OpTester test("GridSample", 16); std::string mode = "nearest"; std::string padding_mode = "zeros"; @@ -55,7 +60,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_nearest_zeros_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "nearest"; std::string padding_mode = "zeros"; @@ -75,7 +80,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_nearest_border_align_corners) { OpTester test("GridSample", 16); std::string mode = "nearest"; std::string padding_mode = "border"; @@ -95,7 +100,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_nearest_border_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "nearest"; std::string padding_mode = "border"; @@ -115,7 +120,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_border_no_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_nearest_reflection_align_corners) { OpTester test("GridSample", 16); std::string mode = "nearest"; std::string padding_mode = "reflection"; @@ -135,7 +140,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_nearest_reflection_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "nearest"; std::string padding_mode = "reflection"; @@ -155,7 +160,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_nearest_reflection_no_align_corners) RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bilinear_zeros_align_corners) { OpTester test("GridSample", 16); std::string mode = "bilinear"; std::string padding_mode = "zeros"; @@ -175,7 +180,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_align_corners) { RunTests(test, GetExecutionProviders(16)); } 
-TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bilinear_zeros_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "bilinear"; std::string padding_mode = "zeros"; @@ -195,7 +200,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bilinear_border_align_corners) { OpTester test("GridSample", 16); std::string mode = "bilinear"; std::string padding_mode = "border"; @@ -215,7 +220,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bilinear_border_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "bilinear"; std::string padding_mode = "border"; @@ -235,7 +240,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_border_no_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bilinear_reflection_align_corners) { OpTester test("GridSample", 16); std::string mode = "bilinear"; std::string padding_mode = "reflection"; @@ -255,7 +260,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bilinear_reflection_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "bilinear"; std::string padding_mode = "reflection"; @@ -275,7 +280,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bilinear_reflection_no_align_corners RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bicubic_zeros_align_corners) { OpTester test("GridSample", 16); std::string mode = "bicubic"; std::string padding_mode = "zeros"; @@ -295,7 +300,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bicubic_zeros_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "bicubic"; std::string padding_mode = "zeros"; @@ -315,7 +320,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bicubic_border_align_corners) { OpTester test("GridSample", 16); std::string mode = "bicubic"; std::string padding_mode = "border"; @@ -335,7 +340,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bicubic_border_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "bicubic"; std::string padding_mode = "border"; @@ -355,7 +360,7 @@ TEST(GridsampleTest, 
test_grid_sample_16_4D_bicubic_border_no_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bicubic_reflection_align_corners) { OpTester test("GridSample", 16); std::string mode = "bicubic"; std::string padding_mode = "reflection"; @@ -375,7 +380,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_align_corners) { RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_16_4D_bicubic_reflection_no_align_corners) { OpTester test("GridSample", 16); std::string mode = "bicubic"; std::string padding_mode = "reflection"; @@ -395,7 +400,7 @@ TEST(GridsampleTest, test_grid_sample_16_4D_bicubic_reflection_no_align_corners) RunTests(test, GetExecutionProviders(16)); } -TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_nearest_zeros_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "zeros"; @@ -415,7 +420,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_nearest_zeros_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "zeros"; @@ -435,7 +440,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_nearest_zeros_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "zeros"; @@ -455,7 +460,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_nearest_zeros_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "zeros"; @@ -475,7 +480,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_nearest_border_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "border"; @@ -495,7 +500,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_nearest_border_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "border"; @@ -515,7 +520,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_nearest_border_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = 
"border"; @@ -535,7 +540,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_border_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_nearest_border_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "border"; @@ -555,7 +560,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_border_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_nearest_reflection_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "reflection"; @@ -575,7 +580,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_nearest_reflection_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "reflection"; @@ -595,7 +600,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_nearest_reflection_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "reflection"; @@ -615,7 +620,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_nearest_reflection_no_align_corners) RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_nearest_reflection_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "nearest"; std::string padding_mode = "reflection"; @@ -635,7 +640,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_nearest_reflection_no_align_corners) RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bilinear_zeros_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "zeros"; @@ -655,7 +660,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_bilinear_zeros_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "zeros"; @@ -675,7 +680,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bilinear_zeros_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "zeros"; @@ -695,7 +700,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_no_align_corners) { +TEST(GridSampleTest, 
test_grid_sample_20_5D_bilinear_zeros_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "zeros"; @@ -715,7 +720,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bilinear_border_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "border"; @@ -735,7 +740,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_bilinear_border_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "border"; @@ -755,7 +760,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bilinear_border_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "border"; @@ -775,7 +780,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_border_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_bilinear_border_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "border"; @@ -795,7 +800,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_border_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bilinear_reflection_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "reflection"; @@ -815,7 +820,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_bilinear_reflection_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "reflection"; @@ -835,7 +840,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bilinear_reflection_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "reflection"; @@ -855,7 +860,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bilinear_reflection_no_align_corners RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_5D_bilinear_reflection_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "linear"; std::string padding_mode = "reflection"; @@ -875,7 +880,7 @@ TEST(GridsampleTest, test_grid_sample_20_5D_bilinear_reflection_no_align_corners RunTests(test, 
GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bicubic_zeros_align_corners) { OpTester test("GridSample", 20); std::string mode = "cubic"; std::string padding_mode = "zeros"; @@ -895,7 +900,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bicubic_zeros_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "cubic"; std::string padding_mode = "zeros"; @@ -915,7 +920,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_zeros_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bicubic_border_align_corners) { OpTester test("GridSample", 20); std::string mode = "cubic"; std::string padding_mode = "border"; @@ -935,7 +940,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bicubic_border_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "cubic"; std::string padding_mode = "border"; @@ -955,7 +960,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_border_no_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bicubic_reflection_align_corners) { OpTester test("GridSample", 20); std::string mode = "cubic"; std::string padding_mode = "reflection"; @@ -975,7 +980,7 @@ TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_align_corners) { RunTests(test, GetExecutionProviders(20)); } -TEST(GridsampleTest, test_grid_sample_20_4D_bicubic_reflection_no_align_corners) { +TEST(GridSampleTest, test_grid_sample_20_4D_bicubic_reflection_no_align_corners) { OpTester test("GridSample", 20); std::string mode = "cubic"; std::string padding_mode = "reflection"; diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py b/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py index c60e55617774f..c7e263ca3f654 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test_gen.py @@ -58,7 +58,7 @@ onnx_align_corners = 1 if align_corners else 0 test_name = f"test_grid_sample_{opset_version}_{ndim}D_{mode}_{padding_mode}_{'align_corners' if align_corners else 'no_align_corners'}" - print(f"TEST(GridsampleTest, {test_name}) {{") + print(f"TEST(GridSampleTest, {test_name}) {{") print(f'OpTester test("GridSample", {opset_version});') print(f'std::string mode = "{onnx_mode}";') print(f'std::string padding_mode = "{padding_mode}";') diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index c33184686c932..b65b0f64686a9 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -11,6 +11,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Gemm|Input B must be constant.| |ai.onnx:GlobalAveragePool|Only 2D 
Pool is supported currently. 3D and 5D support can be added if needed.| |ai.onnx:GlobalMaxPool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.| +|ai.onnx:GridSample|4D input.
'mode' of 'linear' or 'zeros'.
(mode==linear && padding_mode==reflection && align_corners==0) is not supported.| |ai.onnx:MatMul|Only support for transA == 0, alpha == 1.0 and beta == 1.0 is currently implemented.| |ai.onnx:MaxPool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.| |ai.onnx:Mul|| From 0274008b6baa89a907527169a888efcd58646f5b Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Wed, 24 Jul 2024 09:51:49 +0800 Subject: [PATCH 06/57] [WebNN EP] ConvTranspose should calculate the pads or output shape (#21292) This PR adds the missing pads and output shape calculation for ConvTranspose. Per ONNX spec: - If the output shape is explicitly provided, compute the pads. - Otherwise compute the output shape, as well as the pads if the auto_pad attribute is SAME_UPPER/SAME_LOWER. --- .../webnn/builders/impl/builder_utils.cc | 88 +++++++++++++++++++ .../webnn/builders/impl/builder_utils.h | 13 +++ .../webnn/builders/impl/conv_op_builder.cc | 81 ++++++----------- 3 files changed, 126 insertions(+), 56 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc index d147ffbbd181f..113cc3df5438d 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.cc @@ -68,5 +68,93 @@ common::Status HandleAutoPad(const std::vector input_shape, return Status::OK(); } +common::Status ComputeConvTransposePadAndOutputShape( + const int64_t in_size, + const int64_t stride, + const int64_t kernel, + const int64_t dilation, + const int64_t adj, + AutoPadType pad_type, + int64_t& pad_head, + int64_t& pad_tail, + int64_t& out_size) { + // Output shape is explicitly provided - pad values will have to be computed. + if (out_size != -1) { + // total pad + auto total_pad = ComputeTotalPad(in_size, stride, adj, kernel, dilation, out_size); + DistributePadding(pad_type, total_pad, pad_head, pad_tail); + return Status::OK(); + } + + // Output shape is not provided - it needs to be computed along with pad values (if applicable). + + // Compute padding if the auto_pad attribute is SAME_UPPER/SAME_LOWER. + if (pad_type == AutoPadType::SAME_UPPER || pad_type == AutoPadType::SAME_LOWER) { + // The ONNX spec says if `auto_pad` attribute is set, pad until the `out_size` + // is `in_size * stride`. + auto total_pad = ComputeTotalPad(in_size, stride, adj, + kernel, dilation, /*out_size = */ in_size * stride); + DistributePadding(pad_type, total_pad, pad_head, pad_tail); + } + + out_size = (in_size - 1) * stride + adj + (kernel - 1) * dilation + 1 - pad_head - pad_tail; + + return Status::OK(); +} + +common::Status ComputeConvTransposePadsAndOutputShape(const std::vector input_shape, + const int64_t weight_size_y, + const int64_t weight_size_x, + const std::vector& onnx_pads, + const std::vector& onnx_strides, + const std::vector& onnx_dilations, + const std::vector& onnx_output_padding, + AutoPadType auto_pad_type, + std::vector& pads_out, + std::vector& output_shape_out, + bool use_nchw) { + const int64_t input_size_y = use_nchw ? input_shape[2] : input_shape[1]; + const int64_t input_size_x = use_nchw ? 
input_shape[3] : input_shape[2]; + const int64_t stride_y = onnx_strides[0]; + const int64_t stride_x = onnx_strides[1]; + const int64_t dilation_y = onnx_dilations[0]; + const int64_t dilation_x = onnx_dilations[1]; + const int64_t output_padding_y = onnx_output_padding[0]; + const int64_t output_padding_x = onnx_output_padding[1]; + + int64_t padding_top = onnx_pads[0]; + int64_t padding_bottom = onnx_pads[2]; + int64_t padding_left = onnx_pads[1]; + int64_t padding_right = onnx_pads[3]; + int64_t output_shape_out_y = output_shape_out[0]; + int64_t output_shape_out_x = output_shape_out[1]; + ORT_RETURN_IF_ERROR(ComputeConvTransposePadAndOutputShape( + input_size_y, + stride_y, + weight_size_y, + dilation_y, + output_padding_y, + auto_pad_type, + padding_top, + padding_bottom, + output_shape_out_y)); + ORT_RETURN_IF_ERROR(ComputeConvTransposePadAndOutputShape( + input_size_x, + stride_x, + weight_size_x, + dilation_x, + output_padding_x, + auto_pad_type, + padding_left, + padding_right, + output_shape_out_x)); + + // WebNN only needs the height and width of the output shape. + output_shape_out = {output_shape_out_y, output_shape_out_x}; + pads_out = {padding_top, padding_left, padding_bottom, padding_right}; + + return Status::OK(); +} + } // namespace webnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.h b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.h index cb7c3c6955664..5a156c96c4852 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/builder_utils.h +++ b/onnxruntime/core/providers/webnn/builders/impl/builder_utils.h @@ -24,5 +24,18 @@ common::Status HandleAutoPad(const std::vector input_shape, std::vector& pads_out, bool use_nchw) ORT_MUST_USE_RESULT; +// Compute pads and output shape for ConvTranspose. +common::Status ComputeConvTransposePadsAndOutputShape(const std::vector input_shape, + const int64_t weight_size_y, + const int64_t weight_size_x, + const std::vector& onnx_pads, + const std::vector& onnx_strides, + const std::vector& onnx_dilations, + const std::vector& onnx_output_padding, + AutoPadType auto_pad_type, + std::vector& pads_out, + std::vector& output_shape_out, + bool use_nchw) ORT_MUST_USE_RESULT; + } // namespace webnn } // namespace onnxruntime diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc index 847db6a9975c6..320aaa03930fd 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc @@ -56,72 +56,41 @@ common::Status SetConvBaseOptions(ModelBuilder& model_builder, // Add Padding. AutoPadType auto_pad_type = StringToAutoPadType(helper.Get("auto_pad", "NOTSET")); - if (node.OpType() == "Conv") { + std::vector pads_out; + if (node.OpType() == "Conv" || node.OpType() == "ConvInteger") { // Calculate explicit padding for autoPad. if (AutoPadType::SAME_UPPER == auto_pad_type || AutoPadType::SAME_LOWER == auto_pad_type) { - std::vector pads_out; ORT_RETURN_IF_ERROR(HandleAutoPad(input_shape, weight_shape[2], weight_shape[3], pads, strides, dilations, auto_pad_type, pads_out, !is_nhwc)); pads = pads_out; } } else if (node.OpType() == "ConvTranspose") { - // When the 'output_shape' is specificed, the 'output_padding' values - // in options.outputPadding are ignored. 
- std::vector dims; - std::vector output_padding{0, 0}; - if (helper.HasAttr("output_shape")) { - // Default value of 'output_shape' will be ignored as we already check if it existed. - dims = helper.Get("output_shape", std::vector{-1, -1}); - // Extract the height and width. - std::vector output_shape; - if (dims.size() == 1 && is_conv1d) { // ConvTranspose 1d - output_shape = {dims[0], 1}; - } else if (dims.size() == 2 && !is_conv1d) { - output_shape = dims; - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Invalid output shape"); - } - // Padding values are auto generated. - if (helper.HasAttr("kernel_shape")) { - std::vector kernel_shape = helper.Get("kernel_shape", std::vector{-1, -1}); - if (is_conv1d) { // ConvTranspose 1d - kernel_shape.push_back(1); - } - std::vector total_padding(2); - for (size_t i = 0; i < 2; i++) { - // Get the dimensions of H and W. - // For NHWC layout, the dimensions of H and W correspond to index 1 and 2. - // For NCHW layout, the dimensions of H and W correspond to index 2 and 3. - if (is_nhwc) { - total_padding[i] = strides[i] * (input_shape[i + 1] - 1) + output_padding[i] + - ((kernel_shape[i] - 1) * dilations[i] + 1) - output_shape[i]; - } else { - total_padding[i] = strides[i] * (input_shape[i + 2] - 1) + output_padding[i] + - ((kernel_shape[i] - 1) * dilations[i] + 1) - output_shape[i]; - } - } - AutoPadType auto_pad_type = StringToAutoPadType(helper.Get("auto_pad", "NOTSET")); - if (AutoPadType::SAME_UPPER == auto_pad_type || AutoPadType::SAME_LOWER == auto_pad_type) { - pads[0] = total_padding[0] / 2; - pads[1] = total_padding[0] - pads[0]; - pads[2] = total_padding[1] / 2; - pads[3] = total_padding[1] - pads[2]; - if (AutoPadType::SAME_LOWER == auto_pad_type) { - std::swap(pads[0], pads[1]); - std::swap(pads[2], pads[3]); - } - } - } + std::vector output_shape = helper.Get("output_shape", std::vector{-1, -1}); + // Appending 1's if it is ConvTranspose 1d and output shape is provided. + if (output_shape.size() == 1 && is_conv1d && output_shape[0] != -1) { + output_shape.push_back(1); + } + + std::vector output_padding = helper.Get("output_padding", std::vector{0, 0}); + // Appending 0's if it is ConvTranspose 1d. + if (output_padding.size() == 1 && is_conv1d) { + output_padding.push_back(0); + } + options.set("outputPadding", emscripten::val::array(GetVecUint32FromVecInt64(output_padding))); + + // If output shape is explicitly provided, compute the pads. + // Otherwise compute the output shape, as well as the pads if the auto_pad attribute is SAME_UPPER/SAME_LOWER. 
+ ORT_RETURN_IF_ERROR(ComputeConvTransposePadsAndOutputShape(input_shape, weight_shape[2], weight_shape[3], + pads, strides, dilations, output_padding, + auto_pad_type, pads_out, output_shape, !is_nhwc)); + + if (output_shape[0] != -1 && output_shape[1] != -1) { options.set("outputSizes", emscripten::val::array(GetVecUint32FromVecInt64(output_shape))); - } else { - output_padding = helper.Get("output_padding", std::vector{0, 0}); - if (output_padding.size() == 1 && is_conv1d) { // ConvTranspose 1d - output_padding.push_back(0); - } - options.set("outputPadding", emscripten::val::array(GetVecUint32FromVecInt64(output_padding))); } + pads = pads_out; } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "conv_op_builder only supports Op Conv and ConvTranspose."); + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "conv_op_builder only supports Op Conv, ConvInteger and ConvTranspose."); } const auto group = helper.Get("group", static_cast(1)); From 6794dfd9412ec66977aa4ee01b7ea01a6dfa7296 Mon Sep 17 00:00:00 2001 From: Chester Liu <4710575+skyline75489@users.noreply.github.com> Date: Wed, 24 Jul 2024 13:41:09 +0800 Subject: [PATCH 07/57] [QNN EP] Improve QNN error reporting using the error message (#21458) ### Description Massively improve the QNN error reporting by invoking `QnnError_getMessage` and returning the error message. ### Motivation and Context Example error message before this change: ```text QNN SetupBackend failed Failed to create device. Error: 14001 ``` After: ```text QNN SetupBackend failed Failed to create device. Error: QNN_DEVICE_ERROR_INVALID_CONFIG: Invalid config values ``` --- .../qnn/builder/qnn_backend_manager.cc | 55 +++++++++++-------- .../qnn/builder/qnn_backend_manager.h | 1 + 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index f44efb1eba6db..0005869f13f66 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -279,7 +279,7 @@ Status QnnBackendManager::InitializeQnnLog() { } } - ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to initialize logging in the QNN backend"); + ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to initialize logging in the QNN backend. Error: ", QnnErrorHandleToString(result)); return Status::OK(); } @@ -320,7 +320,7 @@ Status QnnBackendManager::UpdateQnnLogLevel(logging::Severity ort_log_level) { LOGS(*logger_, ERROR) << "Invalid log handle provided to QnnLog_setLogLevel."; } } - ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to set log level in Qnn backend"); + ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to set log level in Qnn backend. Error: ", QnnErrorHandleToString(result)); return Status::OK(); } @@ -330,8 +330,8 @@ Status QnnBackendManager::InitializeBackend() { return Status::OK(); } - auto result = qnn_interface_.backendCreate(log_handle_, (const QnnBackend_Config_t**)backend_config_, &backend_handle_); - ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to initialize backend"); + Qnn_ErrorHandle_t result = qnn_interface_.backendCreate(log_handle_, (const QnnBackend_Config_t**)backend_config_, &backend_handle_); + ORT_RETURN_IF(QNN_BACKEND_NO_ERROR != result, "Failed to initialize backend. 
Error: ", QnnErrorHandleToString(result)); backend_initialized_ = true; return Status::OK(); @@ -406,9 +406,9 @@ Status QnnBackendManager::CreateDevice() { LOGS_DEFAULT(INFO) << "Create device."; if (nullptr != qnn_interface_.deviceCreate) { - auto result = qnn_interface_.deviceCreate(log_handle_, device_configs_builder.GetQnnConfigs(), &device_handle_); + Qnn_ErrorHandle_t result = qnn_interface_.deviceCreate(log_handle_, device_configs_builder.GetQnnConfigs(), &device_handle_); if (QNN_SUCCESS != result) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create device. Error: ", result); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create device. Error: ", QnnErrorHandleToString(result)); } } device_created_ = true; @@ -422,9 +422,9 @@ Status QnnBackendManager::ReleaseDevice() { } if (nullptr != qnn_interface_.deviceFree) { - auto result = qnn_interface_.deviceFree(device_handle_); + Qnn_ErrorHandle_t result = qnn_interface_.deviceFree(device_handle_); if (QNN_SUCCESS != result) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to release device. Error: ", result); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to release device. Error: ", QnnErrorHandleToString(result)); } } @@ -451,8 +451,8 @@ Status QnnBackendManager::InitializeProfiling() { } else if (ProfilingLevel::DETAILED == profiling_level_merge_) { qnn_profile_level = QNN_PROFILE_LEVEL_DETAILED; } - auto result = qnn_interface_.profileCreate(backend_handle_, qnn_profile_level, &profile_backend_handle_); - ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to create QNN profile!"); + Qnn_ErrorHandle_t result = qnn_interface_.profileCreate(backend_handle_, qnn_profile_level, &profile_backend_handle_); + ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to create QNN profile! Error: ", QnnErrorHandleToString(result)); return Status::OK(); } @@ -525,13 +525,13 @@ Status QnnBackendManager::CreateContext() { const QnnContext_Config_t* context_configs[] = {&qnn_context_config, nullptr}; Qnn_ContextHandle_t context = nullptr; - auto result = qnn_interface_.contextCreate(backend_handle_, - device_handle_, - context_configs, - &context); + Qnn_ErrorHandle_t result = qnn_interface_.contextCreate(backend_handle_, + device_handle_, + context_configs, + &context); contexts_.push_back(context); - ORT_RETURN_IF(QNN_CONTEXT_NO_ERROR != result, "Failed to create context."); + ORT_RETURN_IF(QNN_CONTEXT_NO_ERROR != result, "Failed to create context. Error: ", QnnErrorHandleToString(result)); context_created_ = true; return Status::OK(); @@ -544,7 +544,7 @@ Status QnnBackendManager::ReleaseContext() { bool failed = false; for (auto context : contexts_) { - auto result = qnn_interface_.contextFree(context, nullptr); + Qnn_ErrorHandle_t result = qnn_interface_.contextFree(context, nullptr); if (QNN_CONTEXT_NO_ERROR != result) { failed = true; } @@ -566,7 +566,7 @@ std::unique_ptr QnnBackendManager::GetContextBinaryBuffer(uint6 // Generate all graphs in one single context Qnn_ErrorHandle_t rt = qnn_interface_.contextGetBinarySize(contexts_[0], &required_buffer_size); if (QNN_CONTEXT_NO_ERROR != rt) { - LOGS(*logger_, ERROR) << "Failed to get QNN context binary size. Error code: " << rt; + LOGS(*logger_, ERROR) << "Failed to get QNN context binary size. 
Error: " << QnnErrorHandleToString(rt); return nullptr; } @@ -581,7 +581,7 @@ std::unique_ptr QnnBackendManager::GetContextBinaryBuffer(uint6 required_buffer_size, &written_buffer_size); if (QNN_CONTEXT_NO_ERROR != rt) { - LOGS(*logger_, ERROR) << "Failed to get context binary."; + LOGS(*logger_, ERROR) << "Failed to get context binary. Error: " << QnnErrorHandleToString(rt); return nullptr; } @@ -1014,8 +1014,8 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() { const QnnProfile_EventId_t* profile_events{nullptr}; uint32_t num_events{0}; - auto result = qnn_interface_.profileGetEvents(profile_backend_handle_, &profile_events, &num_events); - ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to get profile events."); + Qnn_ErrorHandle_t result = qnn_interface_.profileGetEvents(profile_backend_handle_, &profile_events, &num_events); + ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to get profile events. Error: ", QnnErrorHandleToString(result)); if (num_events > 0) { LOGS(*logger_, VERBOSE) << "profile_events: " << profile_events << " num_events: " << num_events; @@ -1073,8 +1073,8 @@ Status QnnBackendManager::ExtractProfilingSubEvents( bool tracelogging_provider_ep_enabled) { const QnnProfile_EventId_t* profile_sub_events{nullptr}; uint32_t num_sub_events{0}; - auto result = qnn_interface_.profileGetSubEvents(profile_event_id, &profile_sub_events, &num_sub_events); - ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to get profile sub events."); + Qnn_ErrorHandle_t result = qnn_interface_.profileGetSubEvents(profile_event_id, &profile_sub_events, &num_sub_events); + ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to get profile sub events. Error: ", QnnErrorHandleToString(result)); if (num_sub_events > 0) { LOGS(*logger_, VERBOSE) << "profile_sub_events: " << profile_sub_events << " num_sub_events: " << num_sub_events; @@ -1113,7 +1113,7 @@ Status QnnBackendManager::ExtractProfilingEventBasic( std::ofstream& outfile, bool tracelogging_provider_ep_enabled) { QnnProfile_EventData_t event_data; - auto result = qnn_interface_.profileGetEventData(profile_event_id, &event_data); + Qnn_ErrorHandle_t result = qnn_interface_.profileGetEventData(profile_event_id, &event_data); QnnProfile_Error_t errorCode = static_cast(result & 0xFFFF); ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to get profile event data: " + std::string(QnnProfileErrorToString(errorCode))); @@ -1293,6 +1293,15 @@ const char* QnnBackendManager::QnnProfileErrorToString(QnnProfile_Error_t error) } } +const char* QnnBackendManager::QnnErrorHandleToString(Qnn_ErrorHandle_t error) { + // From QNN SDK: The memory is statically owned and should not be freed by the caller. 
+ const char* error_msg = nullptr; + if (QNN_SUCCESS == qnn_interface_.errorGetMessage(error, &error_msg)) { + return error_msg; + } + return "Unknown"; +} + const std::string QnnBackendManager::ExtractQnnScalarValue(const Qnn_Scalar_t& scalar) { switch (scalar.dataType) { case QNN_DATATYPE_INT_8: diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h index d51e547aeb2fb..a4811b2cb6db3 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h @@ -216,6 +216,7 @@ class QnnBackendManager { static const std::string GetEventTypeString(QnnProfile_EventType_t eventType); static const std::string ExtractQnnScalarValue(const Qnn_Scalar_t& scalar); const char* QnnProfileErrorToString(QnnProfile_Error_t error); + const char* QnnErrorHandleToString(Qnn_ErrorHandle_t error); QnnLog_Level_t MapOrtSeverityToQNNLogLevel(logging::Severity ort_log_level); #ifdef _WIN32 void LogQnnProfileEventAsTraceLogging( From 2580d935cbecd756cef435fb173a2f10237e9d2a Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Wed, 24 Jul 2024 16:08:20 +1000 Subject: [PATCH 08/57] CoreML: Add ML Program ConvTranspose (#21416) ### Description Add ML Program ConvTranspose - some limitations to simplify the implementation for now - some limitations due to flaky CoreML output Added support for non-contiguous MLMultiArray output as we see that with some unit tests when the CPU-only flag is not set (e.g. innermost dim has min size of 16 but test output only has 8 values). - support only one non-contiguous dim to keep it simple - manually tested as we don't have a setup that can test objective-c code - test code is in model.mm and can be enabled via ifdef if we need to validate any future changes ### Motivation and Context Address operator gaps in high priority model. 
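For reference, here is a minimal standalone sketch (illustrative only; the helper name and float-only signature are assumptions, not the code added in this PR) of the block/stride copy pattern used for an MLMultiArray with one non-contiguous dimension: the data is treated as `num_blocks` contiguous blocks of `block_size` elements, with consecutive blocks `stride` elements apart in the source.

```cpp
#include <cstdint>
#include <cstring>

// Copy a source buffer with at most one non-contiguous dimension into a packed destination.
// A fully contiguous source is the special case num_blocks == 1 and block_size == element count.
void CopyBlocks(const float* src, float* dst,
                int64_t num_blocks, int64_t block_size, int64_t stride) {
  for (int64_t i = 0; i < num_blocks; ++i) {
    std::memcpy(dst, src, block_size * sizeof(float));
    src += stride;      // skip the gap between blocks in the non-contiguous source
    dst += block_size;  // destination is packed back-to-back
  }
}
```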
--------- Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- cmake/onnxruntime_unittests.cmake | 5 +- .../builders/impl/convtranspose_op_builder.cc | 218 ++++++++++++++++++ .../coreml/builders/impl/resize_op_builder.cc | 4 +- .../coreml/builders/op_builder_factory.cc | 164 ++++--------- .../coreml/builders/op_builder_factory.h | 1 + .../core/providers/coreml/model/model.h | 13 ++ .../core/providers/coreml/model/model.mm | 140 ++++++++--- .../builders/impl/resize_op_builder.cc | 4 +- onnxruntime/core/providers/utils.cc | 2 +- onnxruntime/core/providers/utils.h | 2 +- .../providers/xnnpack/nn/conv_transpose.cc | 2 +- .../core/providers/xnnpack/tensor/resize.cc | 4 +- .../test/providers/coreml/utils_test.mm | 108 +++++++++ .../cpu/nn/conv_transpose_op_test.cc | 8 +- .../apple/coreml_supported_mlprogram_ops.md | 1 + 15 files changed, 511 insertions(+), 165 deletions(-) create mode 100644 onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc create mode 100644 onnxruntime/test/providers/coreml/utils_test.mm diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 38ed0b1640192..0c1e5e93c6844 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -679,7 +679,10 @@ if(onnxruntime_USE_RKNPU) endif() if(onnxruntime_USE_COREML) - list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/coreml/*) + list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/coreml/*.cc) + if(APPLE) + list(APPEND onnxruntime_test_framework_src_patterns ${TEST_SRC_DIR}/providers/coreml/*.mm) + endif() list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_coreml coreml_proto) list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_coreml coreml_proto) list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_coreml coreml_proto) diff --git a/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc new file mode 100644 index 0000000000000..5b6d9d72ab3c9 --- /dev/null +++ b/onnxruntime/core/providers/coreml/builders/impl/convtranspose_op_builder.cc @@ -0,0 +1,218 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#include "core/providers/common.h" +#include "core/providers/coreml/builders/helper.h" +#include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" +#include "core/providers/coreml/builders/model_builder.h" +#include "core/providers/coreml/builders/op_builder_factory.h" +#include "core/providers/coreml/shape_utils.h" +#include "core/providers/shared/utils/utils.h" + +using namespace CoreML::Specification; + +namespace onnxruntime { +namespace coreml { + +class ConvTransposeOpBuilder : public BaseOpBuilder { + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override; + + bool IsOpSupportedImpl(const Node& /* node */, const OpBuilderInputParams& /* input_params */, + const logging::Logger& /* logger */) const override; + + bool SupportsMLProgram() const override { return true; } +}; + +Status ConvTransposeOpBuilder::AddToModelBuilderImpl([[maybe_unused]] ModelBuilder& model_builder, + [[maybe_unused]] const Node& node, + const logging::Logger& /*logger*/) const { +#if defined(COREML_ENABLE_MLPROGRAM) + using namespace CoreML::Specification::MILSpec; // NOLINT + const auto input_defs = node.InputDefs(); + const auto output_defs = node.OutputDefs(); + const auto& input_name = input_defs[0]->Name(); + + NodeAttrHelper helper(node); + + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.conv.conv_transpose + std::unique_ptr op = model_builder.CreateOperation(node, "conv_transpose"); + const auto& op_type = op->type(); + + AddOperationInput(*op, "x", input_name); + AddOperationInput(*op, "weight", input_defs[1]->Name()); + + if (input_defs.size() > 2) { + AddOperationInput(*op, "bias", input_defs[2]->Name()); + } + + // we know this input has a valid shape due to the check in IsOpSupportedImpl. ignore N and C dims. + const auto num_spatial_dims = input_defs[1]->Shape()->dim_size() - 2; + + // Spec says strides/dilations/pads are optional but reality is they're required for at least the iOS15 target + // which is CoreML5. Due to that we just add everything for simplicity. + const auto strides = helper.Get("strides", std::vector(num_spatial_dims, 1)); + const auto dilations = helper.Get("dilations", std::vector(num_spatial_dims, 1)); + + AddOperationInput(*op, "strides", model_builder.AddConstant(op_type, "strides", strides)); + AddOperationInput(*op, "dilations", model_builder.AddConstant(op_type, "dilations", dilations)); + + const std::optional groups = helper.GetInt64("group"); + if (groups) { + AddOperationInput(*op, "groups", model_builder.AddScalarConstant(op_type, "groups", *groups)); + } + + // if we can enable output_shape, this code works. see IsOpSupportedImpl for the reason it's disabled. 
+ // const auto output_shape = helper.GetInt64s("output_shape"); + // if (output_shape) { + // AddOperationInput(*op, "output_shape", model_builder.AddConstant(op_type, "output_shape", *output_shape)); + // // these are required despite the spec saying otherwise + // AddOperationInput(*op, "pad_type", model_builder.AddScalarConstant(op_type, "pad_type", std::string("valid"))); + // std::vector pads(num_spatial_dims * 2, 0); + // AddOperationInput(*op, "pad", model_builder.AddConstant(op_type, "pad", pads)); + //} else { + // AddPadTypeAndPads(*op, model_builder, op_type, helper, num_spatial_dims); + //} + + AddPadTypeAndPads(*op, model_builder, op_type, helper, num_spatial_dims); + + AddOperationOutput(*op, *output_defs[0]); + + model_builder.AddOperation(std::move(op)); +#endif // defined(COREML_ENABLE_MLPROGRAM) + + return Status::OK(); +} + +bool ConvTransposeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, + const logging::Logger& logger) const { + if (!input_params.create_mlprogram) { + LOGS(logger, VERBOSE) << "ConvTranspose: ML Program required"; + return false; + } + + // ML Program + // - const weight until CoreML7 (iOS17) + // - require constant for now as non-const would be unusual and we rely on the shape of W to be known to validate + // the kernel_shape can be used + // - const bias + // - const pad + // - if auto_pad is same_upper or same_lower the output[i] - (input[i] * strides[i]) must be divisible by 2 + // as the pads must be equally split as there's no upper/lower option in CoreML + // - punting on supporting this for now + // - must be symmetric for CoreML to do the right thing + // - const strides/dilations/groups + // - output_shape CoreML output is inconsistent so disabled for now + // + // NOTE: need to test with/without the COREML_FLAG_USE_CPU_ONLY flag being set to get an idea of how flaky the CoreML + // behavior is. + // Update /onnxruntime/test/util/default_providers.cc:DefaultCoreMLExecutionProvider to do so + + const auto& input_defs = node.InputDefs(); + + std::vector input_shape; + if (!GetShape(*input_defs[0], input_shape, logger)) { + // requires the rank at least to be known + LOGS(logger, VERBOSE) << "ConvTranspose: failed to get input shape"; + return false; + } + + // for simplicity require weight to be constant + const auto& weight_arg = *input_defs[1]; + const auto& weight_name = input_defs[1]->Name(); + const auto* weight = input_params.graph_viewer.GetConstantInitializer(weight_name); + if (!weight) { + LOGS(logger, VERBOSE) << "ConvTranspose: weight must be constant"; + return false; + } + + if (input_defs.size() > 2 && !input_params.graph_viewer.GetConstantInitializer(input_defs[2]->Name())) { + LOGS(logger, VERBOSE) << "ConvTranspose: bias must be constant"; + return false; + } + + std::vector weight_shape; + if (!GetShape(weight_arg, weight_shape, logger)) { + // impossible as it's a constant initializer + LOGS(logger, VERBOSE) << "ConvTranspose: failed to get weight shape"; + return false; + } + + int64_t num_spatial_dims = narrow(weight_shape.size()) - 2; + + NodeAttrHelper helper(node); + + // Punt on SAME_UPPER/SAME_LOWER for now. + // We could infer that 'same' -> 'same_upper' based on the CoreML conv spec having 'same' and 'same_lower' but + // need to validate that assertion. + // Additionally, if the pads size is equal, there's no difference between same_upper and same_lower. + // To do that we'd need the 'output_shape' attribute to check against. + // Can add this handling if/when needed. 
+ auto autopad = StringToAutoPadType(helper.Get("auto_pad", "NOTSET")); + if (autopad == AutoPadType::SAME_LOWER || autopad == AutoPadType::SAME_UPPER) { + LOGS(logger, VERBOSE) << "ConvTranspose: support for SAME_LOWER/SAME_UPPER is not implemented yet"; + return false; + } else if (autopad == AutoPadType::NOTSET) { + // CoreML output is inconsistent between CPU_ONLY and ALL if the pads aren't all the same value. + // CPU matches the expected output, but other devices don't seem to (at least on macOS). + auto onnx_pads = *helper.GetInt64s("pads"); // 'pads' are required if auto_pad is NOTSET + const auto pad_value = onnx_pads[0]; + if (!std::all_of(onnx_pads.begin() + 1, onnx_pads.end(), + [pad_value](auto value) { return value == pad_value; })) { + LOGS(logger, VERBOSE) << "ConvTranspose: all pad values must be the same for CoreML to return " + "consistent results"; + return false; + } + } + + // there's no input to specify a kernel shape in CoreML. + // it's OK if a specified kernel_shape matches kH and kW dims of the weight input. + auto kernel_shape = helper.GetInt64s("kernel_shape"); + if (kernel_shape) { + bool valid = true; + + if (static_cast(kernel_shape->size()) == num_spatial_dims) { + for (int i = 0; i < num_spatial_dims; ++i) { + // check the specified kernel shape matches the weight shape. skip the initial N and C dims in the latter. + if ((*kernel_shape)[i] != weight_shape[i + 2]) { + valid = false; + break; + } + } + } else { + valid = false; + } + + if (!valid) { + LOGS(logger, VERBOSE) << "ConvTranspose: kernel_shape attribute does not match the weight shape"; + return false; + } + } + + // In theory this can be supported, but running with COREML_FLAG_USE_CPU_ONLY produces output that doesn't match + // ONNX. Running without that flag produces the expected output. Madness... + auto output_shape = helper.GetInt64s("output_shape"); + if (output_shape) { + LOGS(logger, VERBOSE) << "ConvTranspose: output_shape is not supported as the CoreML output is inconsistent"; + return false; + } + + // output_padding, if specified, must be the default value of all zeros as there's no equivalent in CoreML. 
+ auto output_padding = helper.GetInt64s("output_padding"); + if (output_padding && + std::any_of(output_padding->begin(), output_padding->end(), [](auto value) { return value != 0; })) { + LOGS(logger, VERBOSE) << "ConvTranspose: output_padding is not supported"; + return false; + } + + return true; +} + +void CreateConvTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace coreml +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc index 65b5c17f2c6a6..7ff66e4a79e37 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc @@ -427,13 +427,13 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa auto h_in = input_shape[input_rank - 2]; auto w_in = input_shape[input_rank - 1]; - if (!utils::IsScalingByAFactorOfN(h_in, scale_h)) { + if (!utils::ReciprocalIsAFactorOfN(h_in, scale_h)) { LOGS(logger, VERBOSE) << "Resize: downsampling scale " << scale_h << " is not a factor of input height: " << h_in; return false; } - if (!utils::IsScalingByAFactorOfN(w_in, scale_w)) { + if (!utils::ReciprocalIsAFactorOfN(w_in, scale_w)) { LOGS(logger, VERBOSE) << "Resize: downsampling scale " << scale_w << " is not a factor of input width: " << w_in; return false; diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc index b17827f8e0532..535712f096010 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc @@ -15,120 +15,56 @@ namespace coreml { static OpBuilderRegistrations CreateOpBuilderRegistrations() { OpBuilderRegistrations op_registrations; - { // Add/Mul/Pow/Sub/Div - CreateBinaryOpBuilder("Add", op_registrations); - CreateBinaryOpBuilder("Mul", op_registrations); - CreateBinaryOpBuilder("Pow", op_registrations); - CreateBinaryOpBuilder("Sub", op_registrations); - CreateBinaryOpBuilder("Div", op_registrations); - } - - { // Activations - CreateActivationOpBuilder("Sigmoid", op_registrations); - CreateActivationOpBuilder("Tanh", op_registrations); - CreateActivationOpBuilder("Relu", op_registrations); - CreateActivationOpBuilder("PRelu", op_registrations); - CreateActivationOpBuilder("LeakyRelu", op_registrations); - } - - { // Transpose - CreateTransposeOpBuilder("Transpose", op_registrations); - } - - { // Conv - CreateConvOpBuilder("Conv", op_registrations); - } - - { // Batch Normalization - CreateBatchNormalizationOpBuilder("BatchNormalization", op_registrations); - } - - { // Reshape - CreateReshapeOpBuilder("Reshape", op_registrations); - } - - { // DepthToSpace - CreateDepthToSpaceOpBuilder("DepthToSpace", op_registrations); - } - - { // Pool - CreatePoolOpBuilder("GlobalAveragePool", op_registrations); - CreatePoolOpBuilder("GlobalMaxPool", op_registrations); - CreatePoolOpBuilder("AveragePool", op_registrations); - CreatePoolOpBuilder("MaxPool", op_registrations); - } - - { // Concat - CreateConcatOpBuilder("Concat", op_registrations); - } - - { // Resize - CreateResizeOpBuilder("Resize", op_registrations); - } - - { // Gemm/MatMul - 
CreateGemmOpBuilder("Gemm", op_registrations); - CreateGemmOpBuilder("MatMul", op_registrations); - } - - { // Clip - CreateClipOpBuilder("Clip", op_registrations); - } - - { // Squeeze - CreateSqueezeOpBuilder("Squeeze", op_registrations); - } - - { // ArgMax - CreateArgMaxOpBuilder("ArgMax", op_registrations); - } - - { // Cast - CreateCastOpBuilder("Cast", op_registrations); - } - - { // Flatten - CreateFlattenOpBuilder("Flatten", op_registrations); - } - - { // LRN - CreateLRNOpBuilder("LRN", op_registrations); - } - - { // Pad - CreatePadOpBuilder("Pad", op_registrations); - } - - { // Unary - CreateUnaryOpBuilder("Sqrt", op_registrations); - CreateUnaryOpBuilder("Reciprocal", op_registrations); - } - - { // Reduction - // ReduceMean is used in layer normalization which seems to be problematic in Python tests. - CreateReductionOpBuilder("ReduceMean", op_registrations); - CreateReductionOpBuilder("ReduceSum", op_registrations); - } - - { // Shape - CreateShapeOpBuilder("Shape", op_registrations); - } - - { // Gather - CreateGatherOpBuilder("Gather", op_registrations); - } - - { // Slice - CreateSliceOpBuilder("Slice", op_registrations); - } - - { // Softmax - CreateSoftmaxOpBuilder("Softmax", op_registrations); - } - - { // Split - CreateSplitOpBuilder("Split", op_registrations); - } + // Unary ops + CreateUnaryOpBuilder("Sqrt", op_registrations); + CreateUnaryOpBuilder("Reciprocal", op_registrations); + + // Binary elementwise ops + CreateBinaryOpBuilder("Add", op_registrations); + CreateBinaryOpBuilder("Mul", op_registrations); + CreateBinaryOpBuilder("Pow", op_registrations); + CreateBinaryOpBuilder("Sub", op_registrations); + CreateBinaryOpBuilder("Div", op_registrations); + + // Activations + CreateActivationOpBuilder("Sigmoid", op_registrations); + CreateActivationOpBuilder("Tanh", op_registrations); + CreateActivationOpBuilder("Relu", op_registrations); + CreateActivationOpBuilder("PRelu", op_registrations); + CreateActivationOpBuilder("LeakyRelu", op_registrations); + + // Pooling ops + CreatePoolOpBuilder("GlobalAveragePool", op_registrations); + CreatePoolOpBuilder("GlobalMaxPool", op_registrations); + CreatePoolOpBuilder("AveragePool", op_registrations); + CreatePoolOpBuilder("MaxPool", op_registrations); + + // Reduction ops + CreateReductionOpBuilder("ReduceMean", op_registrations); + CreateReductionOpBuilder("ReduceSum", op_registrations); + + CreateArgMaxOpBuilder("ArgMax", op_registrations); + CreateBatchNormalizationOpBuilder("BatchNormalization", op_registrations); + CreateCastOpBuilder("Cast", op_registrations); + CreateClipOpBuilder("Clip", op_registrations); + CreateConcatOpBuilder("Concat", op_registrations); + CreateConvOpBuilder("Conv", op_registrations); + CreateConvTransposeOpBuilder("ConvTranspose", op_registrations); + CreateDepthToSpaceOpBuilder("DepthToSpace", op_registrations); + CreateFlattenOpBuilder("Flatten", op_registrations); + CreateGatherOpBuilder("Gather", op_registrations); + CreateGemmOpBuilder("Gemm", op_registrations); + CreateLRNOpBuilder("LRN", op_registrations); + CreateGemmOpBuilder("MatMul", op_registrations); + CreatePadOpBuilder("Pad", op_registrations); + CreateReshapeOpBuilder("Reshape", op_registrations); + CreateResizeOpBuilder("Resize", op_registrations); + CreateShapeOpBuilder("Shape", op_registrations); + CreateSliceOpBuilder("Slice", op_registrations); + CreateSplitOpBuilder("Split", op_registrations); + CreateSoftmaxOpBuilder("Softmax", op_registrations); + CreateSqueezeOpBuilder("Squeeze", op_registrations); + 
CreateTransposeOpBuilder("Transpose", op_registrations); CreateGridSampleOpBuilder("GridSample", op_registrations); diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h index a9a8ab90b0863..1990fb6400ce1 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.h @@ -24,6 +24,7 @@ void CreateCastOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_ void CreateClipOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateConcatOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateConvOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateConvTransposeOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateDepthToSpaceOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateFlattenOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGatherOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); diff --git a/onnxruntime/core/providers/coreml/model/model.h b/onnxruntime/core/providers/coreml/model/model.h index c4c3b38bba516..75b9aaf2185c9 100644 --- a/onnxruntime/core/providers/coreml/model/model.h +++ b/onnxruntime/core/providers/coreml/model/model.h @@ -13,6 +13,10 @@ #include "core/common/status.h" #include "core/platform/ort_mutex.h" +#if defined(__OBJC__) +@class MLMultiArray; +#endif + namespace onnxruntime { namespace coreml { @@ -32,6 +36,15 @@ using GetOutputTensorMutableRawDataFn = std::function static_shape)>; +#if defined(__OBJC__) +// helper function that we unit test. +// Handles an MLMultiArray that is contiguous, or has one non-contiguous dimension. +// The output values can be used to copy the array data to a contiguous buffer. +// Loop num_blocks times, copying block_size elements each time, moving stride elements between copies. +// A contiguous array will have num_blocks == 1, block_size == total_size (i.e. 
can be copied in a single operation) +Status GetMLMultiArrayCopyInfo(const MLMultiArray* array, int64_t& num_blocks, int64_t& block_size, int64_t& stride); +#endif + class Model { public: Model(const std::string& path, diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm index 1d506099b4367..4fd822f0d0d15 100644 --- a/onnxruntime/core/providers/coreml/model/model.mm +++ b/onnxruntime/core/providers/coreml/model/model.mm @@ -174,51 +174,69 @@ Status CreateInputFeatureProvider(const std::unordered_map mlmultiarray_buffer_size) { + const MLMultiArray* array, + const int64_t num_blocks, const int64_t block_size, const int64_t stride, + const OnnxTensorInfo* tensor_info) { if (mlmultiarray_buffer == nullptr) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "mlmultiarray_buffer has no data"); } - const size_t num_elements = array_info.count; + // total including non-contiguous space + + int64_t array_total_elements = [array.strides[0] longLongValue] * [array.shape[0] longLongValue]; + const int64_t num_elements = array.count; + + ORT_RETURN_IF(array_total_elements != num_blocks * stride || + num_elements != num_blocks * block_size, + "MLMultiArray size does not match the copy info"); + const auto onnx_data_type = tensor_info->data_type; switch (onnx_data_type) { case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: { - const auto output_data_byte_size = num_elements * sizeof(float); - ORT_RETURN_IF_NOT(!mlmultiarray_buffer_size || mlmultiarray_buffer_size == output_data_byte_size, - "CoreML output buffer size and expected output size differ"); - memcpy(tensor_buffer, mlmultiarray_buffer, output_data_byte_size); + const auto* src_buffer = static_cast(mlmultiarray_buffer); + auto* dst_buffer = static_cast(tensor_buffer); + const auto block_byte_size = block_size * sizeof(float); + + for (int64_t idx = 0; idx < num_blocks; ++idx) { + memcpy(dst_buffer, src_buffer, block_byte_size); + src_buffer += stride; + dst_buffer += block_size; + } break; } case ONNX_NAMESPACE::TensorProto_DataType_INT32: { - const auto output_data_byte_size = num_elements * sizeof(int32_t); - ORT_RETURN_IF_NOT(!mlmultiarray_buffer_size || mlmultiarray_buffer_size == output_data_byte_size, - "CoreML output buffer size and expected output size differ"); - memcpy(tensor_buffer, mlmultiarray_buffer, output_data_byte_size); + const auto* src_buffer = static_cast(mlmultiarray_buffer); + auto* dst_buffer = static_cast(tensor_buffer); + const auto block_byte_size = block_size * sizeof(int32_t); + + for (int64_t idx = 0; idx < num_blocks; ++idx) { + memcpy(dst_buffer, src_buffer, block_byte_size); + src_buffer += stride; + dst_buffer += block_size; + } + break; } // For this case, since Coreml Spec only uses int32 for model output while onnx provides // int64 for model output data type. 
We are doing a type casting (int32 -> int64) here // when copying the model to ORT case ONNX_NAMESPACE::TensorProto_DataType_INT64: { - ORT_RETURN_IF_NOT(array_info.dataType == MLMultiArrayDataTypeInt32, - "CoreML output data type is not MLMultiArrayDataTypeInt32"); - ORT_RETURN_IF_NOT(!mlmultiarray_buffer_size || mlmultiarray_buffer_size == num_elements * sizeof(int32_t), - "CoreML output buffer size and expected output size differ"); - const auto model_output_span = gsl::span{static_cast(mlmultiarray_buffer), num_elements}; - const auto output_span = gsl::span{static_cast(tensor_buffer), num_elements}; - std::transform(model_output_span.begin(), model_output_span.end(), output_span.begin(), - [](int32_t v) { return static_cast(v); }); + ORT_RETURN_IF(array.dataType != MLMultiArrayDataTypeInt32, + "CoreML output data type is not MLMultiArrayDataTypeInt32"); + + const int32_t* src_buffer = static_cast(mlmultiarray_buffer); + int64_t* dst_buffer = static_cast(tensor_buffer); + + for (int64_t idx = 0; idx < num_blocks; ++idx) { + auto input_span = gsl::span{src_buffer, static_cast(block_size)}; + auto output_span = gsl::span{dst_buffer, static_cast(block_size)}; + std::transform(input_span.begin(), input_span.end(), output_span.begin(), + [](int32_t v) { return static_cast(v); }); + + src_buffer += stride; + dst_buffer += block_size; + } break; } default: @@ -250,8 +268,7 @@ - (void)dealloc; - (Status)loadModel API_AVAILABLE_COREML3; - (Status)predict:(const std::unordered_map&)inputs outputs:(const std::unordered_map&)outputs - getOutputTensorDataFn:(const GetOutputTensorMutableRawDataFn&) - get_output_tensor_mutable_raw_data_fn + getOutputTensorDataFn:(const GetOutputTensorMutableRawDataFn&)get_output_tensor_mutable_raw_data_fn API_AVAILABLE_COREML3; @property(nullable) MLModel* model API_AVAILABLE_COREML3; @@ -397,21 +414,27 @@ - (Status)predict:(const std::unordered_map&)inputs ") do not match"); } - ORT_RETURN_IF_NOT(IsArrayContiguous(data), - "Non-contiguous output MLMultiArray is not currently supported"); + // support a non-contiguous array, provided only one dimension is not contiguous + int64_t num_blocks = 0; + int64_t block_size = 0; + int64_t stride = 0; + + ORT_RETURN_IF_ERROR(GetMLMultiArrayCopyInfo(data, num_blocks, block_size, stride)); + __block Status copy_status; const auto* tensor_info = &output_tensor_info; // `getBytesWithHandler` replaces deprecated `.dataPointer` on new versions if (@available(macOS 12.3, iOS 15.4, *)) { [data getBytesWithHandler:^(const void* bytes, NSInteger size) { - copy_status = CopyMLMultiArrayBuffer(bytes, output_buffer, data, tensor_info, size); + copy_status = CopyMLMultiArrayBuffer(bytes, output_buffer, data, + num_blocks, block_size, stride, tensor_info); }]; } else { - // disable size check as old API does not return buffer length - copy_status = CopyMLMultiArrayBuffer(data.dataPointer, output_buffer, data, tensor_info, std::nullopt); + copy_status = CopyMLMultiArrayBuffer(data.dataPointer, output_buffer, data, + num_blocks, block_size, stride, tensor_info); } - if (!copy_status.IsOK()) - return copy_status; + + ORT_RETURN_IF_ERROR(copy_status); } } } @@ -431,6 +454,49 @@ - (Status)predict:(const std::unordered_map&)inputs namespace onnxruntime { namespace coreml { +Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array, + int64_t& num_blocks, int64_t& block_size, int64_t& stride) { + const auto* shape = array.shape; + const auto rank = shape.count; + + int64_t array_total_elements = [array.strides[0] longLongValue] * [shape[0] 
longLongValue]; + + int64_t data_elems = 1; // actual values + int64_t total_elems = 1; // elems including empty slots if non-contiguous + for (unsigned long i = 1; i <= rank; i++) { + int64_t this_stride = [array.strides[rank - i] longLongValue]; + if (this_stride != total_elems) { + // non-contiguous + if (block_size != 0) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, + "Multiple non-contiguous dimensions in MLMultiArray are not supported."); + } + + block_size = data_elems; + stride = this_stride; + } + + const auto elems_this_dim = [shape[rank - i] longLongValue]; + data_elems *= elems_this_dim; + total_elems = elems_this_dim * this_stride; + } + + if (block_size == 0) { + // all data is contiguous + block_size = data_elems; + stride = array_total_elements; + assert(block_size == stride); + } + + num_blocks = data_elems / block_size; + + ORT_ENFORCE(array_total_elements == total_elems, "Logic error calculating copy info"); + ORT_ENFORCE(stride >= block_size, "Logic error calculating copy info"); + ORT_ENFORCE(stride * num_blocks == total_elems, "Logic error calculating copy info"); + + return Status::OK(); +} + // Internal Execution class // This class will bridge Model (c++) with CoreMLExecution (objective c++) class Execution { diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc index ef27f6c942f44..44403010c936c 100644 --- a/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/builders/impl/resize_op_builder.cc @@ -274,8 +274,8 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const GraphViewer& graph_viewer, const N return false; } - if (!utils::IsScalingByAFactorOfN(h_in, scale_h) || - !utils::IsScalingByAFactorOfN(w_in, scale_w)) { + if (!utils::ReciprocalIsAFactorOfN(h_in, scale_h) || + !utils::ReciprocalIsAFactorOfN(w_in, scale_w)) { LOGS_DEFAULT(VERBOSE) << "Input size must be evenly divisible by output size when downsampling"; return false; } diff --git a/onnxruntime/core/providers/utils.cc b/onnxruntime/core/providers/utils.cc index 747b09e42aa21..2725af95e0959 100644 --- a/onnxruntime/core/providers/utils.cc +++ b/onnxruntime/core/providers/utils.cc @@ -24,7 +24,7 @@ common::Status OutputOptionalWithoutDataHelper(const ONNX_NAMESPACE::TypeProto& } #endif -bool IsScalingByAFactorOfN(int64_t n, float scale) { +bool ReciprocalIsAFactorOfN(int64_t n, float scale) { bool is_factor = false; if (scale > 0.f && scale < 1.f) { const double factor = 1.0 / scale; diff --git a/onnxruntime/core/providers/utils.h b/onnxruntime/core/providers/utils.h index 9ea8496a02f85..cfd71d9b838b3 100644 --- a/onnxruntime/core/providers/utils.h +++ b/onnxruntime/core/providers/utils.h @@ -19,6 +19,6 @@ common::Status OutputOptionalWithoutDataHelper(const ONNX_NAMESPACE::TypeProto& /// Check if the reciprocal of 'scale' is a factor of 'n'. /// e.g. a scale of 0.5 is 1/2, the reciprocal is 2, and 2 is a factor of any even number. 
/// -bool IsScalingByAFactorOfN(int64_t n, float scale); +bool ReciprocalIsAFactorOfN(int64_t n, float scale); } // namespace utils } // namespace onnxruntime diff --git a/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc b/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc index c136385f12476..01c8119fea79d 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc @@ -24,7 +24,7 @@ Status ConvTranspose::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr const auto rank = orig_shape.NumDimensions(); if (conv_transpose_attrs_.group > 1) { - // Xnnpack [G, Oc, H, W Ic/G] + // Xnnpack [G, Oc, H, W, Ic/G] // (ref: https://github.com/google/XNNPACK/blob/ecd8311c8fd3d9ab47edbc3df5f2b5de7dabe75f/test/deconvolution-operator-tester.h#L678) if (rank == 4) { // split C (dim 0) into {group, C/group} diff --git a/onnxruntime/core/providers/xnnpack/tensor/resize.cc b/onnxruntime/core/providers/xnnpack/tensor/resize.cc index c752b5f849808..cf874796ba169 100644 --- a/onnxruntime/core/providers/xnnpack/tensor/resize.cc +++ b/onnxruntime/core/providers/xnnpack/tensor/resize.cc @@ -85,8 +85,8 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, float scale_h = scales[2]; float scale_w = scales[3]; - if (!utils::IsScalingByAFactorOfN(h_in, scale_h) || - !utils::IsScalingByAFactorOfN(w_in, scale_w)) { + if (!utils::ReciprocalIsAFactorOfN(h_in, scale_h) || + !utils::ReciprocalIsAFactorOfN(w_in, scale_w)) { break; } } diff --git a/onnxruntime/test/providers/coreml/utils_test.mm b/onnxruntime/test/providers/coreml/utils_test.mm new file mode 100644 index 0000000000000..f55f108494e3e --- /dev/null +++ b/onnxruntime/test/providers/coreml/utils_test.mm @@ -0,0 +1,108 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#import + +#include "gtest/gtest.h" +#include "gmock/gmock.h" + +#include "core/providers/coreml/model/model.h" +#include "test/util/include/asserts.h" + +namespace onnxruntime { +namespace test { +namespace { +auto ValidateGetInfo(MLMultiArray* array, + int64_t expected_num_blocks, int64_t expected_block_size, int64_t expected_stride, + bool expect_valid) { + int64_t num_blocks = 0; + int64_t block_size = 0; + int64_t stride = 0; + auto status = coreml::GetMLMultiArrayCopyInfo(array, num_blocks, block_size, stride); + + if (!expect_valid) { + ASSERT_STATUS_NOT_OK(status); + return; + } + + ASSERT_STATUS_OK(status); + ASSERT_EQ(num_blocks, expected_num_blocks); + ASSERT_EQ(block_size, expected_block_size); + ASSERT_EQ(stride, expected_stride); +} +} // namespace + +TEST(CoreMLUtils, GetMLMultiArrayReadInfo) { + // fake pointer. we don't read any data but initWithDataPointer requires a non-null address + void* data = reinterpret_cast(0xfeedf00d); + + // a dim is non-contiguous if the stride is > the total number of elements in its inner dimensions + + // dim -1 with non-contiguous data. 1 element (as it's the inner-most dimension) but the stride is 2. + { + NSArray* shape = @[ @1, @1, @8, @8 ]; + NSArray* strides = @[ @128, @128, @16, @2 ]; + + auto* array = [[MLMultiArray alloc] initWithDataPointer:data + shape:shape + dataType:MLMultiArrayDataTypeInt32 + strides:strides + deallocator:^(void* /* bytes */) { + } + error:nil]; + ValidateGetInfo(array, 64, 1, 2, true); + } + + // dim -2 with non-contiguous data. 8 elements in the inner dimension but the stride is 16. 
+ { + NSArray* shape = @[ @1, @1, @8, @8 ]; + NSArray* strides = @[ @128, @128, @16, @1 ]; + + auto* array = [[MLMultiArray alloc] initWithDataPointer:data + shape:shape + dataType:MLMultiArrayDataTypeInt32 + strides:strides + deallocator:^(void* /* bytes */) { + } + error:nil]; + ValidateGetInfo(array, 8, 8, 16, true); + } + + // dim -3 with non-contiguous data. 16 elements in the inner dimensions but stride is 24. + { + NSArray* shape = @[ @1, @2, @4, @4 ]; + NSArray* strides = @[ @48, @24, @4, @1 ]; + + auto* array = [[MLMultiArray alloc] initWithDataPointer:data + shape:shape + dataType:MLMultiArrayDataTypeInt32 + strides:strides + deallocator:^(void* /* bytes */) { + } + error:nil]; + + ValidateGetInfo(array, 2, 16, 24, true); + } + + // two non-contiguous dims (dim -2 and dim -3) + // dim -2 has 4 elements in the inner dimension and stride of 8 + // dim -3 has 32 elements in the inner dimensions (we need to include the empty elements from the non-contiguous data + // in dim -2) and stride of 48 + { + // dim + NSArray* shape = @[ @1, @2, @4, @4 ]; + NSArray* strides = @[ @96, @48, @8, @1 ]; + + auto* array = [[MLMultiArray alloc] initWithDataPointer:data + shape:shape + dataType:MLMultiArrayDataTypeInt32 + strides:strides + deallocator:^(void* /* bytes */) { + } + error:nil]; + + ValidateGetInfo(array, 0, 0, 0, false); + } +} +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc index 81191e9b48c3c..2bf53ce5b5986 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_transpose_op_test.cc @@ -27,7 +27,7 @@ void TestConvTransposeOpInitializer(const ConvTransposeOpAttributes& attributes, const vector>& input_shapes, const std::initializer_list& expected_output, const vector& expected_output_shape, - bool is_filter_initializer = false, + bool is_weight_and_bias_initializer = false, OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, const std::string& err_str = "", const std::unordered_set& excluded_provider_types = {kTensorrtExecutionProvider}) { @@ -58,10 +58,10 @@ void TestConvTransposeOpInitializer(const ConvTransposeOpAttributes& attributes, } ORT_ENFORCE(inputs.size() <= 3, "Our name array is only setup to handle 3 inputs"); - const char* szNames[] = {"X", "W", "B"}; - bool isInitializers[] = {false, is_filter_initializer, false}; + const char* input_names[] = {"X", "W", "B"}; + bool is_initializers[] = {false, is_weight_and_bias_initializer, is_weight_and_bias_initializer}; for (size_t i = 0; i < inputs.size(); i++) { - test.AddInput(szNames[i], input_shapes[i], inputs[i], isInitializers[i]); + test.AddInput(input_names[i], input_shapes[i], inputs[i], is_initializers[i]); } test.AddOutput("Y", expected_output_shape, expected_output); diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index b65b0f64686a9..5609033fc3e35 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -7,6 +7,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:AveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.| |ai.onnx:Clip|| |ai.onnx:Conv|Only 1D/2D Conv is supported.
Bias if provided must be constant.|
+|ai.onnx:ConvTranspose|Weight and bias must be constant.<br/>padding_type of SAME_UPPER/SAME_LOWER is not supported.<br/>kernel_shape must have default values.<br/>output_shape is not supported.<br/>
output_padding must have default values.| |ai.onnx:Div|| |ai.onnx:Gemm|Input B must be constant.| |ai.onnx:GlobalAveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.| From b04adcc3816b898f27242aaf0cae1d847c0dc988 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Wed, 24 Jul 2024 10:02:00 -0700 Subject: [PATCH 09/57] Update copy_strip_binary.sh: use "make install" instead (#21464) ### Description Before this change, copy_strip_binary.sh manually copies each file from onnx runtime's build folder to an artifact folder. It can be hard when dealing with symbolic link for shared libraries. This PR will change the packaging pipelines to run "make install" first, before packaging shared libs . ### Motivation and Context Recently because of feature request #21281 , we changed libonnxruntime.so's SONAME. Now every package that contains this shared library must also contains libonnxruntime.so.1. Therefore we need to change the packaging scripts to include this file. Instead of manually construct the symlink layout, using `make install` is much easier and will make things more consistent because it is a standard way of making packages. **Breaking change:** After this change, our **inference** tarballs that are published to our Github release pages will be not contain ORT **training** headers. --- cmake/onnxruntime.cmake | 1 + cmake/onnxruntime_framework.cmake | 4 +- .../core/optimizer/graph_transformer_utils.cc | 4 +- .../templates/c-api-linux-cpu.yml | 2 +- .../templates/final-jar-testing.yml | 3 +- .../templates/mac-cpu-packaging-steps.yml | 3 ++ .../github/linux/build_cuda_c_api_package.sh | 9 +--- .../github/linux/build_rocm_c_api_package.sh | 9 +--- .../linux/build_tensorrt_c_api_package.sh | 4 +- .../github/linux/copy_strip_binary.sh | 42 ++++--------------- 10 files changed, 25 insertions(+), 56 deletions(-) diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 0e89c2f14d34b..bdb4b00b02a35 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -27,6 +27,7 @@ function(get_c_cxx_api_headers HEADERS_VAR) "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_float16.h" "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h" "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h" + "${REPO_ROOT}/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h" ) if (onnxruntime_ENABLE_TRAINING_APIS) diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake index 43d16abd8fbae..b85edbf37d447 100644 --- a/cmake/onnxruntime_framework.cmake +++ b/cmake/onnxruntime_framework.cmake @@ -123,7 +123,9 @@ if (WIN32) target_compile_definitions(onnxruntime_framework PRIVATE _SCL_SECURE_NO_WARNINGS) endif() -if (NOT onnxruntime_BUILD_SHARED_LIB) +if (onnxruntime_BUILD_SHARED_LIB) + install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/framework/provider_options.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/) +else() install(DIRECTORY ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/framework DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/core) install(TARGETS onnxruntime_framework ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 7da65f18ccacb..ab1dbaea7b7fd 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -189,7 +189,7 @@ 
InlinedVector> GenerateTransformers( const SessionOptions& session_options, const IExecutionProvider& cpu_execution_provider, /*required by constant folding*/ const InlinedHashSet& rules_and_transformers_to_disable, - concurrency::ThreadPool* intra_op_thread_pool) { + [[maybe_unused]] concurrency::ThreadPool* intra_op_thread_pool) { InlinedVector> transformers; const bool disable_quant_qdq = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionsDisableQuantQDQ, "0") == "1"; @@ -419,7 +419,7 @@ InlinedVector> GenerateTransformersForMinimalB const SatApplyContextVariant& apply_context, const IExecutionProvider& cpu_execution_provider, const InlinedHashSet& rules_and_transformers_to_disable, - concurrency::ThreadPool* intra_op_thread_pool) { + [[maybe_unused]] concurrency::ThreadPool* intra_op_thread_pool) { InlinedVector> transformers; const bool saving = std::holds_alternative(apply_context); diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml index 041ea623ecf61..e2b71c5c55fd2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml @@ -69,7 +69,7 @@ jobs: docker run --rm --volume /data/onnx:/data/onnx:ro --volume $(Build.SourcesDirectory):/onnxruntime_src --volume $(Build.BinariesDirectory):/build \ --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}} /bin/bash -c "python3.9 \ /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release \ - --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib ${{ parameters.AdditionalBuildFlags }} && cd /build/Release && make install DESTDIR=/build/linux-${{parameters.OnnxruntimeArch}}" + --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib ${{ parameters.AdditionalBuildFlags }} && cd /build/Release && make install DESTDIR=/build/installed" workingDirectory: $(Build.SourcesDirectory) displayName: 'Build' diff --git a/tools/ci_build/github/azure-pipelines/templates/final-jar-testing.yml b/tools/ci_build/github/azure-pipelines/templates/final-jar-testing.yml index c9b7c01146981..abc96601ffb6c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/final-jar-testing.yml +++ b/tools/ci_build/github/azure-pipelines/templates/final-jar-testing.yml @@ -68,8 +68,9 @@ stages: inputs: targetType: 'inline' script: | + set -e -x echo "Java Version" - java --version + java -version mkdir test pushd test jar xf '$(Build.BinariesDirectory)/final-jar/testing.jar' diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml index 7672b604a5268..84f517a81686d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-steps.yml @@ -46,8 +46,11 @@ steps: ChangeEveryCommit: true BuildStep: - script: | + set -e -x rm -rf $(Build.BinariesDirectory)/Release python3 $(Build.SourcesDirectory)/tools/ci_build/build.py --update --build ${{ parameters.AdditionalBuildFlags }} --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --config Release + cd $(Build.BinariesDirectory)/Release + make install 
DESTDIR=$(Build.BinariesDirectory)/installed displayName: 'Build ${{ parameters.MacosArch }}' env: CCACHE_DIR: ${{ parameters.CacheDir }} diff --git a/tools/ci_build/github/linux/build_cuda_c_api_package.sh b/tools/ci_build/github/linux/build_cuda_c_api_package.sh index 04968aacdb255..57a3bedc1e8e4 100755 --- a/tools/ci_build/github/linux/build_cuda_c_api_package.sh +++ b/tools/ci_build/github/linux/build_cuda_c_api_package.sh @@ -1,10 +1,5 @@ #!/bin/bash set -e -x docker run --rm --volume \ -$BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ ---volume /data/models:/build/models:ro --volume /data/onnx:/data/onnx:ro -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}build \ -/usr/bin/python3.9 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release \ ---skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION \ ---cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION \ ---skip_tests \ ---cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' +$BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}build \ +/bin/bash -c "/usr/bin/python3.9 /onnxruntime_src/tools/ci_build/build.py --enable_lto --build_java --build_nodejs --build_dir /build --config Release --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION --skip_tests --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/build_rocm_c_api_package.sh b/tools/ci_build/github/linux/build_rocm_c_api_package.sh index d70442ad2cae8..9fee565170a1b 100755 --- a/tools/ci_build/github/linux/build_rocm_c_api_package.sh +++ b/tools/ci_build/github/linux/build_rocm_c_api_package.sh @@ -31,14 +31,7 @@ docker run --rm \ --volume /data/onnx:/data/onnx:ro \ --workdir /onnxruntime_src \ $IMAGE \ - ${PYTHON_BIN:-python} /onnxruntime_src/tools/ci_build/build.py \ - --config Release \ - --build_dir /build \ - --parallel \ - --use_rocm --rocm_version=$ROCM_VERSION --rocm_home $ROCM_HOME --nccl_home $ROCM_HOME \ - --build_shared_lib \ - --skip_submodule_sync \ - --skip_tests --cmake_extra_defines FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER + /bin/bash -c "${PYTHON_BIN:-python} /onnxruntime_src/tools/ci_build/build.py --config Release --build_dir /build --parallel --use_rocm --use_binskim_compliant_compile_flags --rocm_version=$ROCM_VERSION --rocm_home $ROCM_HOME --nccl_home $ROCM_HOME --build_shared_lib --skip_submodule_sync --skip_tests --cmake_extra_defines FETCHCONTENT_TRY_FIND_PACKAGE_MODE=NEVER && cd /build/Release && make install DESTDIR=/build/installed" EXIT_CODE=$? 
diff --git a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh index cc63b68d441d7..f0c9d51a53448 100755 --- a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh +++ b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh @@ -3,6 +3,4 @@ set -e -x mkdir -p $HOME/.onnx docker run --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ --volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \ -/usr/bin/python3.9 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release \ ---skip_tests \ ---skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' +/bin/bash -c "/usr/bin/python3.9 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release --skip_tests --skip_submodule_sync --parallel --use_binskim_compliant_compile_flags --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' && cd /build/Release && make install DESTDIR=/build/installed" diff --git a/tools/ci_build/github/linux/copy_strip_binary.sh b/tools/ci_build/github/linux/copy_strip_binary.sh index 65d6d97ebf0a8..f5b4c38c85d4c 100755 --- a/tools/ci_build/github/linux/copy_strip_binary.sh +++ b/tools/ci_build/github/linux/copy_strip_binary.sh @@ -16,46 +16,22 @@ done EXIT_CODE=1 uname -a -mkdir $BINARY_DIR/$ARTIFACT_NAME -mkdir $BINARY_DIR/$ARTIFACT_NAME/lib -mkdir $BINARY_DIR/$ARTIFACT_NAME/include -echo "Directories created" -cp $BINARY_DIR/$BUILD_CONFIG/$LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib -if [[ -f "$BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_cuda.so" ]]; then - cp $BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_shared.so $BINARY_DIR/$ARTIFACT_NAME/lib - cp $BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_cuda.so $BINARY_DIR/$ARTIFACT_NAME/lib -fi -if [[ -f "$BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_tensorrt.so" ]]; then - cp $BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_tensorrt.so $BINARY_DIR/$ARTIFACT_NAME/lib -fi -if [[ -f "$BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_rocm.so" ]]; then - cp $BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_shared.so $BINARY_DIR/$ARTIFACT_NAME/lib - cp $BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_rocm.so $BINARY_DIR/$ARTIFACT_NAME/lib -fi +cd "$BINARY_DIR" +mv installed/usr/local $ARTIFACT_NAME +mv $ARTIFACT_NAME/include/onnxruntime/* $ARTIFACT_NAME/include +rmdir $ARTIFACT_NAME/include/onnxruntime +# Do not ship onnx_test_runner +rm -rf $ARTIFACT_NAME/bin echo "Copy debug symbols in a separate file and strip the original binary." 
if [[ $LIB_NAME == *.dylib ]] then dsymutil $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME -o $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME.dSYM strip -S $BINARY_DIR/$ARTIFACT_NAME/lib/$LIB_NAME - ln -s $LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.dylib # copy the CoreML EP header for macOS build (libs with .dylib ext) cp $SOURCE_DIR/include/onnxruntime/core/providers/coreml/coreml_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include -elif [[ $LIB_NAME == *.so.* ]] -then - ln -s $LIB_NAME $BINARY_DIR/$ARTIFACT_NAME/lib/libonnxruntime.so -fi -cp $SOURCE_DIR/include/onnxruntime/core/session/onnxruntime_*.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/include/onnxruntime/core/framework/provider_options.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/include/onnxruntime/core/providers/cpu/cpu_provider_factory.h $BINARY_DIR/$ARTIFACT_NAME/include -cp $SOURCE_DIR/orttraining/orttraining/training_api/include/onnxruntime_training_*.h $BINARY_DIR/$ARTIFACT_NAME/include - -if [[ -f "$BINARY_DIR/$BUILD_CONFIG/libonnxruntime_providers_cuda.so" ]]; then -# copy headers for context context used in custom ops -mkdir -p $BINARY_DIR/$ARTIFACT_NAME/include/core/providers/cuda -cp $SOURCE_DIR/include/onnxruntime/core/providers/custom_op_context.h $BINARY_DIR/$ARTIFACT_NAME/include/core/providers/custom_op_context.h -cp $SOURCE_DIR/include/onnxruntime/core/providers/resource.h $BINARY_DIR/$ARTIFACT_NAME/include/core/providers/resource.h -cp $SOURCE_DIR/include/onnxruntime/core/providers/cuda/cuda_context.h $BINARY_DIR/$ARTIFACT_NAME/include/core/providers/cuda/cuda_context.h -cp $SOURCE_DIR/include/onnxruntime/core/providers/cuda/cuda_resource.h $BINARY_DIR/$ARTIFACT_NAME/include/core/providers/cuda/cuda_resource.h +else + # Linux + mv $ARTIFACT_NAME/lib64 $ARTIFACT_NAME/lib fi # copy the README, licence and TPN

From eb9b377306c941b01d2823b7655f372a20b82197 Mon Sep 17 00:00:00 2001
From: Adrian Lizarraga
Date: Wed, 24 Jul 2024 10:17:12 -0700
Subject: [PATCH 10/57] [QNN EP] Update to QNN SDK 2.24.0 (#21463)

### Description
- Update pipelines to use QNN SDK 2.24 by default.
- Update QNN_Nuget_Windows pipeline to build the csharp solution without mobile projects (fixes errors).
- Implement workaround for QNN 2.24 validation bug for LayerNorm ops without an explicit bias input.
- Enable Relu unit test, which now passes because Relu is no longer fused into QuantizeLinear for QNN EP.
- Fix bug where a negative quantization axis is not properly normalized for per-channel int4 Conv.

### Motivation and Context
Update QNN SDK.
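The two quantization fixes above are easier to follow in isolation. Below is a minimal standalone sketch, not the EP's actual QnnQuantParamsWrapper/TensorInfo code: it uses plain STL containers and hypothetical helper names (NormalizeAxis, MakeImplicitZeroBias) and only mirrors the arithmetic the patch applies, namely shifting a negative per-channel axis by the tensor rank, and building an all-zero int32 bias whose per-channel scale is the product of the input scale and each scale-input channel scale, with zero offsets.

```
// Standalone sketch (not the QNN EP sources). Helper names are illustrative only.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Normalize a possibly negative per-channel quantization axis against the tensor rank,
// mirroring the per-channel int4 Conv fix described above.
int64_t NormalizeAxis(int64_t axis, size_t rank) {
  assert(rank > 0);
  if (axis < 0) {
    axis += static_cast<int64_t>(rank);
  }
  assert(axis >= 0 && axis < static_cast<int64_t>(rank));
  return axis;
}

// Build the explicit all-zero bias used to work around the LayerNorm validation bug:
// one element per channel of the scale input, bias_scale[i] = input_scale * scale_scale[i],
// and all zero-points set to 0.
struct ZeroBias {
  std::vector<int32_t> data;     // quantized bias values (all zero)
  std::vector<float> scales;     // per-channel bias scales
  std::vector<int32_t> offsets;  // per-channel zero points (all zero)
};

ZeroBias MakeImplicitZeroBias(float input_scale, const std::vector<float>& scale_input_scales) {
  ZeroBias bias;
  bias.data.assign(scale_input_scales.size(), 0);
  bias.offsets.assign(scale_input_scales.size(), 0);
  bias.scales.reserve(scale_input_scales.size());
  for (float s : scale_input_scales) {
    bias.scales.push_back(input_scale * s);
  }
  return bias;
}

int main() {
  // A conv weight of shape {3, 2, 2, 2} quantized on axis -4 is really quantized on axis 0.
  std::cout << "normalized axis: " << NormalizeAxis(-4, 4) << "\n";  // prints 0

  // Per-tensor input scale 0.1, per-channel scale-input scales {0.5, 0.25}.
  ZeroBias bias = MakeImplicitZeroBias(0.1f, {0.5f, 0.25f});
  for (float s : bias.scales) std::cout << s << " ";  // prints 0.05 0.025
  std::cout << "\n";
  return 0;
}
```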
--- .../opbuilder/layer_norm_op_builder.cc | 92 +++++++++++++++ .../qnn/builder/qnn_model_wrapper.cc | 10 ++ .../qnn/builder/qnn_quant_params_wrapper.cc | 107 +++++++++++++++++- .../qnn/builder/qnn_quant_params_wrapper.h | 7 ++ onnxruntime/test/providers/qnn/conv_test.cc | 38 ++++++- .../test/providers/qnn/layer_norm_test.cc | 79 +++++++++++-- .../test/providers/qnn/qnn_test_utils.h | 6 + .../test/providers/qnn/simple_op_htp_test.cc | 2 +- ...arm64-v8a-QNN-crosscompile-ci-pipeline.yml | 2 +- .../c-api-noopenmp-packaging-pipelines.yml | 2 +- .../azure-pipelines/linux-qnn-ci-pipeline.yml | 2 +- .../azure-pipelines/py-packaging-pipeline.yml | 2 +- .../qnn-ep-nuget-packaging-pipeline.yml | 2 +- .../templates/jobs/download_linux_qnn_sdk.yml | 2 +- .../templates/jobs/download_win_qnn_sdk.yml | 2 +- .../templates/py-packaging-stage.yml | 2 +- .../templates/py-win-arm64-qnn.yml | 2 +- .../templates/py-win-x64-qnn.yml | 2 +- .../azure-pipelines/templates/qnn-ep-win.yml | 6 +- .../win-qnn-arm64-ci-pipeline.yml | 2 +- .../azure-pipelines/win-qnn-ci-pipeline.yml | 2 +- 21 files changed, 339 insertions(+), 32 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc index a2dca669c24f6..c667aeeaa61f0 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc @@ -1,9 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include #include "core/providers/common.h" #include "core/providers/shared/utils/utils.h" #include "core/framework/tensorprotoutils.h" +#include "core/providers/qnn/builder/qnn_utils.h" #include "core/providers/qnn/builder/qnn_model_wrapper.h" #include "core/providers/qnn/builder/op_builder_factory.h" #include "core/common/safeint.h" @@ -24,6 +26,11 @@ class LayerNormOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override final ORT_MUST_USE_RESULT; protected: + Status ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const override ORT_MUST_USE_RESULT; Status ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector&& input_names, @@ -55,6 +62,91 @@ Status LayerNormOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, return AddToModelBuilder(qnn_model_wrapper, node_unit, logger, true); } +Status LayerNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, + const NodeUnit& node_unit, + const logging::Logger& logger, + std::vector& input_names, + bool do_op_validation) const { + ORT_UNUSED_PARAMETER(do_op_validation); + + const auto& inputs = node_unit.Inputs(); + const auto input_count = inputs.size(); + constexpr size_t X_IDX = 0; + constexpr size_t SCALE_IDX = 1; + constexpr size_t BIAS_IDX = 2; + + // Input[0] (X, required) + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[X_IDX], logger, input_names)); + + // Input[1] (scale, required) + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[SCALE_IDX], logger, input_names)); + + // Input[2] (bias, optional) + const bool has_bias_input = input_count > BIAS_IDX && inputs[BIAS_IDX].node_arg.Exists(); + if (has_bias_input) { + ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[BIAS_IDX], logger, input_names)); + } + +#if QNN_API_VERSION_MAJOR 
== 2 && QNN_API_VERSION_MINOR == 17 + if (!has_bias_input && IsNpuBackend(qnn_model_wrapper.GetQnnBackendType())) { + // Bias is implicit. QNN SDK 2.24 (QNN API version 2.17) has a validation bug for implicit bias inputs, so provide + // an explicit bias of all 0 (quantized int32). + TensorInfo x_input_info = {}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[X_IDX], x_input_info)); + + TensorInfo scale_input_info = {}; + ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[SCALE_IDX], scale_input_info)); + + if (x_input_info.quant_param.IsPerTensor(/*include_bw*/ true) && scale_input_info.quant_param.IsQuantized()) { + const std::string bias_name = qnn::utils::GetNodeName(node_unit) + "_implicit_bias_ort_qnn_ep"; + + // Make dummy bias input have the same shape as the scale input. + std::vector bias_shape = scale_input_info.shape; + size_t num_bias_elems = 1; + for (size_t i = 0; i < bias_shape.size(); i++) { + num_bias_elems *= static_cast(bias_shape[i]); + } + + // Bias static input should be all zeros. + std::vector bias_bytes(num_bias_elems * sizeof(int32_t), 0); + + // Bias's quantization scale should be the product of the other inputs' quantization scales. + std::vector input0_quant_scales; + std::vector input1_quant_scales; + ORT_RETURN_IF_ERROR(x_input_info.quant_param.GetScales(input0_quant_scales)); + ORT_RETURN_IF_ERROR(scale_input_info.quant_param.GetScales(input1_quant_scales)); + + const size_t num_bias_scales_offsets = input1_quant_scales.size(); + assert(input0_quant_scales.size() == 1); // Expected for per-tensor. + ORT_RETURN_IF_NOT(num_bias_scales_offsets >= input0_quant_scales.size(), + "Input[1] should have >= 1 quantization scale values"); + + std::vector bias_scales(num_bias_scales_offsets); + for (size_t i = 0; i < num_bias_scales_offsets; i++) { + bias_scales[i] = input0_quant_scales[0] * input1_quant_scales[i]; + } + + std::vector bias_offsets(num_bias_scales_offsets, 0); // Bias's zero-points should be all zeros. + QnnQuantParamsWrapper bias_qparams; + + if (scale_input_info.quant_param.IsPerChannel()) { + bias_qparams = QnnQuantParamsWrapper(bias_scales, bias_offsets, /*axis*/ 0, /*is_int4*/ false); + } else { + bias_qparams = QnnQuantParamsWrapper(bias_scales[0], bias_offsets[0]); + } + + auto tensor_wrapper = QnnTensorWrapper(bias_name, QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_SFIXED_POINT_32, + std::move(bias_qparams), std::move(bias_shape), std::move(bias_bytes)); + + qnn_model_wrapper.AddTensorWrapper(std::move(tensor_wrapper)); + input_names.push_back(bias_name); + } + } +#endif + + return Status::OK(); +} + Status LayerNormOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrapper, const NodeUnit& node_unit, std::vector&& input_names, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc index f85cdc401a152..c8537307ef3ba 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_model_wrapper.cc @@ -442,6 +442,16 @@ Status QnnModelWrapper::IsPerChannelQuantized(const onnxruntime::NodeUnitIODef& if (is_per_channel) { axis = io_def.quant_param->axis.value_or(1); // 1 is default axis for Q/DQ ops. + if (axis < 0) { + // Normalize negative axis by adding rank. 
+ const auto* tensor_shape_proto = io_def.node_arg.Shape(); + ORT_RETURN_IF_NOT(tensor_shape_proto != nullptr, "NULL tensor shape proto"); + + const int rank = tensor_shape_proto->dim_size(); + ORT_RETURN_IF_NOT(rank > 0, "Per-channel quantized tensor should be of rank > 0"); + + axis += rank; + } } return Status::OK(); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc index 2d22c3c1b8226..da2d517f65697 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc @@ -30,6 +30,7 @@ QnnQuantParamsWrapper& QnnQuantParamsWrapper::operator=(const QnnQuantParamsWrap return *this; } +// Construct per-tensor quantization params. QnnQuantParamsWrapper::QnnQuantParamsWrapper(float scale, int32_t offset) { params_.encodingDefinition = QNN_DEFINITION_DEFINED; params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_SCALE_OFFSET; @@ -37,6 +38,110 @@ QnnQuantParamsWrapper::QnnQuantParamsWrapper(float scale, int32_t offset) { params_.scaleOffsetEncoding.offset = offset; } +// Construct a per-channel quantization param. +QnnQuantParamsWrapper::QnnQuantParamsWrapper(gsl::span scales, gsl::span offsets, + int32_t axis, bool is_int4) { + assert(scales.size() == offsets.size()); // Logic error if sizes don't match. + const uint32_t num_elems = static_cast(scales.size()); + params_.encodingDefinition = QNN_DEFINITION_DEFINED; + + if (is_int4) { + params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET; + params_.bwAxisScaleOffsetEncoding.numElements = num_elems; + params_.bwAxisScaleOffsetEncoding.axis = axis; + params_.bwAxisScaleOffsetEncoding.bitwidth = 4; + + // Deep copy to the scales[] and offsets[] arrays + if (num_elems > 0) { + const size_t num_scale_bytes = num_elems * sizeof(float); + const size_t num_zp_bytes = num_elems * sizeof(int32_t); + const size_t num_bytes = num_scale_bytes + num_zp_bytes; + constexpr std::uintptr_t align = alignof(float); + static_assert(alignof(float) == alignof(int32_t)); + + per_channel_data_ = std::make_unique(num_bytes + align); + char* scales_begin = ALIGN_PTR_UP(per_channel_data_.get(), align, char*); + char* zps_begin = scales_begin + num_scale_bytes; + + std::memcpy(scales_begin, scales.data(), num_scale_bytes); + std::memcpy(zps_begin, offsets.data(), num_zp_bytes); + params_.bwAxisScaleOffsetEncoding.scales = reinterpret_cast(scales_begin); + params_.bwAxisScaleOffsetEncoding.offsets = reinterpret_cast(zps_begin); + } else { + params_.bwAxisScaleOffsetEncoding.scales = nullptr; + params_.bwAxisScaleOffsetEncoding.offsets = nullptr; + } + } else { + params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET; + params_.axisScaleOffsetEncoding.numScaleOffsets = num_elems; + params_.axisScaleOffsetEncoding.axis = axis; + + // Deep copy to the scaleOffset data. 
+ if (num_elems > 0) { + const size_t num_bytes = num_elems * sizeof(Qnn_ScaleOffset_t); + constexpr std::uintptr_t align = alignof(Qnn_ScaleOffset_t); + per_channel_data_ = std::make_unique(num_bytes + align); + Qnn_ScaleOffset_t* aligned_dst = ALIGN_PTR_UP(per_channel_data_.get(), align, Qnn_ScaleOffset_t*); + + for (size_t i = 0; i < static_cast(num_elems); i++) { + aligned_dst[i].offset = offsets[i]; + aligned_dst[i].scale = scales[i]; + } + + params_.axisScaleOffsetEncoding.scaleOffset = aligned_dst; + } else { + params_.axisScaleOffsetEncoding.scaleOffset = nullptr; + } + } +} + +// Get a copy of scales. Works for both per-tensor and per-channel. +Status QnnQuantParamsWrapper::GetScales(/*out*/ std::vector& scales) const { + ORT_RETURN_IF_NOT(params_.encodingDefinition == QNN_DEFINITION_DEFINED, "Unquantized qparams does not have scales"); + + switch (params_.quantizationEncoding) { + case QNN_QUANTIZATION_ENCODING_SCALE_OFFSET: + scales.resize(1); + scales[0] = params_.scaleOffsetEncoding.scale; + break; + case QNN_QUANTIZATION_ENCODING_BW_SCALE_OFFSET: + scales.resize(1); + scales[0] = params_.bwScaleOffsetEncoding.scale; + break; + case QNN_QUANTIZATION_ENCODING_AXIS_SCALE_OFFSET: { + const uint32_t num_elems = params_.axisScaleOffsetEncoding.numScaleOffsets; + scales.resize(num_elems); + + if (num_elems > 0) { + gsl::span scale_offsets(params_.axisScaleOffsetEncoding.scaleOffset, num_elems); + + for (size_t i = 0; i < num_elems; i++) { + scales[i] = scale_offsets[i].scale; + } + } + break; + } + case QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET: { + const uint32_t num_elems = params_.bwAxisScaleOffsetEncoding.numElements; + scales.resize(num_elems); + + // Deep copy the scales[] and offsets[] arrays + if (num_elems > 0) { + gsl::span src_scales(params_.bwAxisScaleOffsetEncoding.scales, num_elems); + for (size_t i = 0; i < num_elems; i++) { + scales[i] = src_scales[i]; + } + } + break; + } + default: + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported QNN quantization encoding: ", + params_.quantizationEncoding); + } + + return Status::OK(); +} + QnnQuantParamsWrapper QnnQuantParamsWrapper::Copy() const { return QnnQuantParamsWrapper(*this); } @@ -199,7 +304,7 @@ Status QnnQuantParamsWrapper::Init(const QnnModelWrapper& qnn_model_wrapper, con params_.encodingDefinition = QNN_DEFINITION_DEFINED; params_.quantizationEncoding = QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET; - params_.bwAxisScaleOffsetEncoding.axis = static_cast(*(ort_quant_params->axis)); + params_.bwAxisScaleOffsetEncoding.axis = static_cast(axis); params_.bwAxisScaleOffsetEncoding.bitwidth = 4; params_.bwAxisScaleOffsetEncoding.numElements = static_cast(num_elems); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h index d1f93e5a692bc..23330f5616d73 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.h @@ -3,6 +3,7 @@ #pragma once #include +#include #include "QnnTypes.h" #include "core/common/common.h" #include @@ -26,6 +27,9 @@ class QnnQuantParamsWrapper { // Construct a per-tensor quantization param (SCALE_OFFSET) QnnQuantParamsWrapper(float scale, int32_t offset); + // Construct a per-channel quantization param. 
+ QnnQuantParamsWrapper(gsl::span scales, gsl::span offsets, int32_t axis, bool is_int4); + Qnn_QuantizeParams_t& Get() { return params_; } const Qnn_QuantizeParams_t& Get() const { return params_; } @@ -54,6 +58,9 @@ class QnnQuantParamsWrapper { (params_.quantizationEncoding == QNN_QUANTIZATION_ENCODING_BW_AXIS_SCALE_OFFSET)); } + // Get a copy of scales. Works for both per-tensor and per-channel. + Status GetScales(/*out*/ std::vector& scales) const; + // Handle transposing of a per-channel quantized tensor. The quantization parameter's axis // must be transposed using the inverse permutation of the Transpose. template diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc index b07951d2a2e6d..99636976b9c05 100644 --- a/onnxruntime/test/providers/qnn/conv_test.cc +++ b/onnxruntime/test/providers/qnn/conv_test.cc @@ -178,10 +178,14 @@ static GetTestQDQModelFn BuildQDQPerChannelConvTestCase(const s ORT_ENFORCE(weights_def.IsInitializer() && weights_def.IsRawData()); std::vector weight_scales; std::vector weight_zero_points; + TensorShape weights_shape = weights_def.GetTensorShape(); + int64_t pos_weight_quant_axis = weight_quant_axis; + if (pos_weight_quant_axis < 0) { + pos_weight_quant_axis += static_cast(weights_shape.NumDimensions()); + } GetTestInputQuantParamsPerChannel(weights_def, weight_scales, weight_zero_points, - static_cast(weight_quant_axis), true); + static_cast(pos_weight_quant_axis), true); - TensorShape weights_shape = weights_def.GetTensorShape(); std::vector quantized_weights; size_t num_weight_storage_elems = weights_shape.Size(); if constexpr (std::is_same_v || std::is_same_v) { @@ -189,7 +193,7 @@ static GetTestQDQModelFn BuildQDQPerChannelConvTestCase(const s } quantized_weights.resize(num_weight_storage_elems); QuantizeValues(weights_def.GetRawData(), quantized_weights, weights_shape, - weight_scales, weight_zero_points, weight_quant_axis); + weight_scales, weight_zero_points, pos_weight_quant_axis); NodeArg* weights_initializer = builder.MakeInitializer(weights_def.GetShape(), quantized_weights); NodeArg* weights_dq = builder.MakeIntermediate(); @@ -760,6 +764,34 @@ TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel) { 21); // opset } +// Test per-channel QDQ Conv with INT4 weights and a negative weight quantization axis that still points to dimension 0. +TEST_F(QnnHTPBackendTests, ConvU16S4S32_PerChannel_NegativeWeightQuantAxis) { + std::vector input_shape = {1, 2, 4, 4}; + std::vector weight_shape = {3, 2, 2, 2}; + std::vector bias_shape = {3}; + + TestInputDef input_def(input_shape, false, + GetFloatDataInRange(0.0f, 1.0f, TensorShape(input_shape).Size())); + TestInputDef weight_def(weight_shape, true, + GetFloatDataInRange(-1.0f, 5.0f, TensorShape(weight_shape).Size())); + TestInputDef bias_def(bias_shape, true, + GetFloatDataInRange(-1.0f, 1.0f, TensorShape(bias_shape).Size())); + + RunHTPConvOpPerChannelTest("Conv", + input_def, + weight_def, + bias_def, + -4, // negative weight quant axis (same as 0) + {1, 1}, // Strides + {0, 0, 0, 0}, // Pads + {1, 1}, // Dilations + 1, // default group + "NOTSET", + ExpectedEPNodeAssignment::All, + false, // use_qdq_contrib_ops + 21); // opset +} + // Test per-channel QDQ Conv with INT4 weights. in0: u16, in1 (weight): s4, in2 (bias): s32, out: u8 // TODO(adrianlizarraga): Investigate inaccuracy for QNN EP. 
// diff --git a/onnxruntime/test/providers/qnn/layer_norm_test.cc b/onnxruntime/test/providers/qnn/layer_norm_test.cc index 7d129dceca582..2af49a5e500d2 100644 --- a/onnxruntime/test/providers/qnn/layer_norm_test.cc +++ b/onnxruntime/test/providers/qnn/layer_norm_test.cc @@ -79,25 +79,53 @@ TEST_F(QnnCPUBackendTests, LayerNorm3D) { template GetTestQDQModelFn BuildQDQLayerNormTestCase(const TestInputDef& input_def, const TestInputDef& scale_def, + const TestInputDef& bias_def, const std::vector& attrs, bool use_contrib_qdq_ops) { - return [input_def, scale_def, attrs, use_contrib_qdq_ops](ModelTestBuilder& builder, - std::vector>& output_qparams) { + return [input_def, scale_def, bias_def, attrs, + use_contrib_qdq_ops](ModelTestBuilder& builder, + std::vector>& output_qparams) { + std::vector layer_norm_inputs; + // input -> Q -> DQ -> NodeArg* input = MakeTestInput(builder, input_def); QuantParams input_qparams = GetTestInputQuantParams(input_def); NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point, use_contrib_qdq_ops); + layer_norm_inputs.push_back(input_qdq); - // scale input -> Q -> DQ -> - NodeArg* scale = MakeTestInput(builder, scale_def); + NodeArg* scale_qdq = nullptr; QuantParams scale_qparams = GetTestInputQuantParams(scale_def); - NodeArg* scale_qdq = AddQDQNodePair(builder, scale, scale_qparams.scale, scale_qparams.zero_point, - use_contrib_qdq_ops); + + if (scale_def.IsInitializer() && scale_def.IsRawData()) { + // Quantized(scale weights) -> DQ -> + std::vector scale_scales = {scale_qparams.scale}; + std::vector scale_zps = {scale_qparams.zero_point}; + TensorShape scale_shape = scale_def.GetTensorShape(); + std::vector quantized_scales(scale_shape.Size()); + QuantizeValues(scale_def.GetRawData(), quantized_scales, scale_shape, + scale_scales, scale_zps, std::nullopt); + + NodeArg* scale_initzer = builder.MakeInitializer(scale_def.GetShape(), quantized_scales); + scale_qdq = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(scale_initzer, scale_scales, scale_zps, scale_qdq, + nullptr, use_contrib_qdq_ops); + } else { + // scale input -> Q -> DQ -> + NodeArg* scale = MakeTestInput(builder, scale_def); + scale_qdq = AddQDQNodePair(builder, scale, scale_qparams.scale, scale_qparams.zero_point, + use_contrib_qdq_ops); + } + layer_norm_inputs.push_back(scale_qdq); + + if (!bias_def.GetShape().empty()) { + const float bias_scale = input_qparams.scale * scale_qparams.scale; + layer_norm_inputs.push_back(MakeTestQDQBiasInput(builder, bias_def, bias_scale, use_contrib_qdq_ops)); + } // LayerNormalization NodeArg* layer_norm_output = builder.MakeIntermediate(); - Node& layer_norm_node = builder.AddNode("LayerNormalization", {input_qdq, scale_qdq}, {layer_norm_output}); + Node& layer_norm_node = builder.AddNode("LayerNormalization", layer_norm_inputs, {layer_norm_output}); for (const auto& attr : attrs) { layer_norm_node.AddAttributeProto(attr); @@ -114,6 +142,7 @@ GetTestQDQModelFn BuildQDQLayerNormTestCase(const TestInputDef static void RunLayerNormQDQTest(const TestInputDef& input_def, const TestInputDef& scale_def, + const TestInputDef& bias_def, const std::vector& attrs, ExpectedEPNodeAssignment expected_ep_assignment, bool use_contrib_qdq_ops = false) { @@ -125,7 +154,7 @@ static void RunLayerNormQDQTest(const TestInputDef& input_def, #endif TestQDQModelAccuracy(BuildOpTestCase("LayerNormalization", {input_def, scale_def}, {}, attrs), - BuildQDQLayerNormTestCase(input_def, scale_def, attrs, + 
BuildQDQLayerNormTestCase(input_def, scale_def, bias_def, attrs, use_contrib_qdq_ops), provider_options, 17, // opset @@ -136,6 +165,7 @@ static void RunLayerNormQDQTest(const TestInputDef& input_def, TEST_F(QnnHTPBackendTests, LayerNorm1D_Axis0_Unsupported) { RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, 0.0f, 10.0f), TestInputDef({1, 2, 3}, true, 0.0f, 10.0f), + TestInputDef(), {utils::MakeAttribute("axis", static_cast(0))}, // Unsupported axis ExpectedEPNodeAssignment::None); } @@ -143,16 +173,40 @@ TEST_F(QnnHTPBackendTests, LayerNorm1D_Axis0_Unsupported) { // Test accuracy of 8-bit QDQ LayerNorm with a static scale input. TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_AU8_WU8) { RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), - TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), // Static - {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis + TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), + TestInputDef(), // Implicit bias input + {utils::MakeAttribute("axis", static_cast(-1))}, ExpectedEPNodeAssignment::All); } +// Test accuracy of 8-bit QDQ LayerNorm with a static scale input and an explicit bias input (static). +TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_StaticBias_AU8_WU8_BU8) { + RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), + TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), + TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), + {utils::MakeAttribute("axis", static_cast(-1))}, + ExpectedEPNodeAssignment::All); +} + +TEST_F(QnnHTPBackendTests, LayerNorm1D_QNN2_24_ImplicitBias_ValidationBug) { + // QNN 2.24 LayerNorm fails validation (intermittent) if the bias input is not provided. QNN EP will provide an + // explicit bias of all zeros to get around this bug. + for (size_t i = 0; i < 15; i++) { // Run it multiple times since this is an intermittent bug. + RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 1.0f, 6)), + TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), + TestInputDef(), // Implicit bias input + {utils::MakeAttribute("axis", static_cast(-1))}, + ExpectedEPNodeAssignment::All, + true); + } +} + // Test accuracy of 16-bit QDQ LayerNorm with a static scale input. 
TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_AU16_WU8) { RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), // Static - {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis + TestInputDef(), + {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis ExpectedEPNodeAssignment::All, true); // Use 'com.microsoft' Q/DQ ops } @@ -174,7 +228,8 @@ TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_AU16_WU8) { TEST_F(QnnHTPBackendTests, DISABLED_LayerNorm1D_LastAxis_DynamicScale) { RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), TestInputDef({3}, false, GetFloatDataInRange(0.0f, 1.0f, 3)), // Dynamic - {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis + TestInputDef(), + {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis ExpectedEPNodeAssignment::All); } diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.h b/onnxruntime/test/providers/qnn/qnn_test_utils.h index ad54e644af3f7..eb03270dc8461 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.h +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.h @@ -517,6 +517,9 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe ASSERT_STATUS_OK(f32_model.MainGraph().Resolve()); f32_model.ToProto().SerializeToString(&f32_model_data); + // Uncomment to save f32 model to disk for debugging. + // ASSERT_STATUS_OK(onnxruntime::Model::Save(f32_model, ToPathString("cmp_accuracy.f32.onnx"))); + // Run f32 model on CPU EP and collect outputs. std::vector cpu_f32_outputs; InferenceModel(f32_model_data, "f32_model_logger", {}, ExpectedEPNodeAssignment::All, @@ -556,6 +559,9 @@ inline void TestQDQModelAccuracy(const GetTestModelFn& f32_model_fn, const GetTe ASSERT_STATUS_OK(qdq_model.MainGraph().Resolve()); qdq_model.ToProto().SerializeToString(&qdq_model_data); + // Uncomment to save QDQ model to disk for debugging. 
+ // ASSERT_STATUS_OK(onnxruntime::Model::Save(qdq_model, ToPathString("cmp_accuracy.qdq.onnx"))); + bool is_qnn_ep = true; TryEnableQNNSaver(qnn_options); std::vector qnn_qdq_outputs; diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc index f7dc5779ec5d9..2ebc2c6251b44 100644 --- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc @@ -304,7 +304,7 @@ TEST_F(QnnHTPBackendTests, DISABLE_UnaryOp_Elu_U16) { // Expected val: 0 // QNN QDQ val: -10 (err 10) // CPU QDQ val: 0 (err 0) -TEST_F(QnnHTPBackendTests, DISABLED_UnaryOp_Relu) { +TEST_F(QnnHTPBackendTests, UnaryOp_Relu) { RunQDQOpTest("Relu", {TestInputDef({1, 2, 3}, false, GetFloatDataInRange(-10.0f, 10.0f, 6))}, {}, diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index a4a3d0e6b334b..6649206c0d79c 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -31,7 +31,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 700326fe9173c..2eb7046d80e7a 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -62,7 +62,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 resources: repositories: diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index 29ebf67dd3f91..0d67b0947be53 100644 --- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml index 8d1b6b7854e50..cd3966633d742 100644 --- a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml @@ -59,7 +59,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' 
- default: 2.23.0.240531 + default: 2.24.0.240626 trigger: none diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index a8b12637b70f3..7229bc5dbd114 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -2,7 +2,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 - name: build_config displayName: Build Configuration diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml index ada3603ae8476..734ad43e0066d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.23.0.240531' + default: '2.24.0.240626' steps: - script: | diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml index 3a68803896ab3..900adc9690255 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.23.0.240531' + default: '2.24.0.240626' steps: - powershell: | diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 17e64a207be2f..447e35244eb66 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -63,7 +63,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' 
- default: 2.23.0.240531 + default: 2.24.0.240626 stages: - ${{ if eq(parameters.enable_windows_cpu, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index 70221976d978f..40e8583141df8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 - name: PYTHON_VERSION type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index 1bf5db5ae6d9a..33335bb2be2dd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 - name: ENV_SETUP_SCRIPT type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index b4c4f36c5dcc6..944745b69ca63 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -1,5 +1,5 @@ parameters: - QnnSdk: '2.23.0.240531' + QnnSdk: '2.24.0.240626' build_config: 'RelWithDebInfo' IsReleaseBuild: false DoEsrp: false @@ -103,7 +103,7 @@ stages: - task: MSBuild@1 displayName: 'Restore NuGet Packages and create project.assets.json' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: ${{ parameters.build_config }} msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)' @@ -112,7 +112,7 @@ stages: - task: MSBuild@1 displayName: 'Build C# bindings' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: ${{ parameters.build_config }} msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }}' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index 97745fd09fbf7..e1b8b718e9928 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 jobs: - job: 'build' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index 2ab81e16cd57e..97c4ab15095c9 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.23.0.240531 + default: 2.24.0.240626 jobs: - job: 'build' From c203d89958b4b0b23ce6967decc48afdd06b7ddb Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Wed, 24 
Jul 2024 11:50:11 -0700 Subject: [PATCH 11/57] Update ruff and clang-format versions (#21479) ruff -> 0.5.4 clang-format -> 18 --- cgmanifests/generate_cgmanifest.py | 2 +- .../tools/ValidateNativeDelegateAttributes.py | 2 +- include/onnxruntime/core/common/exceptions.h | 6 +- .../core/framework/stream_handles.h | 2 +- include/onnxruntime/core/platform/Barrier.h | 2 +- .../platform/EigenNonBlockingThreadPool.h | 10 +- .../core/providers/custom_op_context.h | 2 +- .../experimental_onnxruntime_cxx_api.h | 6 +- .../core/session/onnxruntime_cxx_api.h | 4 +- .../core/session/onnxruntime_lite_custom_op.h | 2 +- .../contrib_ops/cpu/cpu_contrib_kernels.cc | 134 +- .../contrib_ops/cpu/crop_and_resize.cc | 2 +- .../cuda/bert/flash_attention/alibi.h | 2 +- .../cuda/bert/flash_attention/mask.h | 2 +- .../cuda/bert/flash_attention/softmax.h | 2 +- .../contrib_ops/cuda/cuda_contrib_kernels.cc | 364 +- onnxruntime/core/framework/ex_lib_loader.h | 2 +- .../core/graph/contrib_ops/contrib_defs.cc | 4 +- .../transpose_optimization/optimizer_api.h | 8 +- onnxruntime/core/platform/path_lib.h | 6 +- .../core/providers/coreml/model/model.mm | 2 +- .../providers/cpu/cpu_execution_provider.cc | 3092 ++++++++--------- .../cpu/ml/tree_ensemble_classifier.cc | 18 +- .../core/providers/cpu/ml/treeregressor.cc | 18 +- .../object_detection/non_max_suppression.cc | 4 +- .../cpu/object_detection/roialign.cc | 6 +- .../core/providers/cpu/tensor/expand.cc | 10 +- .../providers/cuda/cuda_execution_provider.cc | 1788 +++++----- onnxruntime/core/providers/cuda/cuda_graph.h | 4 +- .../core/providers/cuda/cuda_profiler.h | 2 +- .../core/providers/cuda/nn/conv_transpose.h | 2 +- .../core/providers/cuda/nvtx_profile.h | 8 +- .../providers/cuda/shared_inc/cuda_utils.h | 2 +- .../core/providers/cuda/tensor/cast_op.cc | 32 +- .../providers/dnnl/dnnl_node_capability.h | 2 +- .../providers/dnnl/subgraph/dnnl_subgraph.h | 2 +- onnxruntime/core/providers/js/allocator.h | 2 +- onnxruntime/core/providers/js/data_transfer.h | 4 +- .../builder/opbuilder/expand_op_builder.cc | 2 +- .../qnn/builder/opbuilder/pad_op_builder.cc | 2 +- .../qnn/builder/qnn_quant_params_wrapper.cc | 2 +- .../core/providers/rocm/rocm_profiler.h | 2 +- .../shared_library/provider_host_api.h | 4 +- .../tensorrt_execution_provider_custom_ops.h | 4 +- .../vitisai/vitisai_provider_factory.cc | 2 +- .../builders/impl/elementwise_op_builder.h | 2 +- .../vsinpu/builders/op_builder_factory.h | 7 +- .../python/onnxruntime_pybind_iobinding.cc | 9 +- .../python/onnxruntime_pybind_ortvalue.cc | 136 +- .../onnxruntime_pybind_sparse_tensor.cc | 3 +- .../python/onnxruntime_pybind_state.cc | 88 +- onnxruntime/python/onnxruntime_validation.py | 17 +- .../tools/pytorch_export_contrib_ops.py | 2 +- .../python/tools/quantization/calibrate.py | 2 +- .../python/tools/symbolic_shape_infer.py | 44 +- .../python/tools/tensorrt/perf/benchmark.py | 12 +- .../python/tools/tensorrt/perf/perf_utils.py | 2 +- .../perf/setup_scripts/setup_onnx_zoo.py | 2 +- .../python/tools/transformers/benchmark.py | 2 +- .../tools/transformers/bert_test_data.py | 6 +- .../tools/transformers/fusion_attention.py | 2 +- .../python/tools/transformers/fusion_utils.py | 2 +- .../bart/utils/export_summarization_edinit.py | 2 +- .../export_summarization_enc_dec_past.py | 2 +- .../models/bart/utils/onnx_inference.py | 4 +- .../models/stable_diffusion/engine_builder.py | 2 +- .../pipeline_stable_diffusion.py | 4 +- .../test/framework/allocation_planner_test.cc | 2 +- onnxruntime/test/onnx/OrtValueList.h | 2 +- 
.../test/onnx/microbenchmark/activation.cc | 2 +- .../qdq_transformer_fastmath_test.cc | 2 +- .../test/optimizer/qdq_transformer_test.cc | 2 +- .../reduction_test_cases_generator.py | 8 +- .../test/providers/cpu/tensor/pad_test.cc | 12 +- .../test/providers/qnn/qnn_basic_test.cc | 10 +- .../test/python/onnx_backend_test_series.py | 2 +- .../test/python/transformers/rotary_flash.py | 3 - .../generate_tiny_keras2onnx_bert_models.py | 4 +- .../generate_tiny_gpt2_model.py | 4 +- onnxruntime/test/shared_lib/custom_op_utils.h | 20 +- onnxruntime/test/testdata/CNTK/gen.py | 4 +- .../core/framework/adasum/adasum_mpi.cc | 3 +- .../orttraining/core/framework/pipeline.h | 2 +- .../torch/custom_function_register.h | 2 +- .../orttraining/core/framework/torch/gil.h | 2 +- .../core/framework/torch/torch_proxy.h | 4 +- .../orttraining/core/graph/graph_augmenter.h | 6 +- .../core/graph/loss_func/loss_func_common.h | 2 +- .../core/graph/pipeline_transformer.cc | 2 +- .../core/optimizer/megatron_transformer.cc | 4 +- .../core/session/training_session.h | 6 +- orttraining/orttraining/lazy_tensor/flags.h | 2 +- orttraining/orttraining/models/bert/main.cc | 3 +- .../orttraining/models/pipeline_poc/main.cc | 54 +- .../orttraining/models/runner/training_util.h | 4 +- .../python/orttraining_pybind_state.cc | 24 +- .../python/training/ort_triton/kernel/_mm.py | 2 +- .../python/training/ortmodule/_utils.py | 2 +- .../cpu/torch_interop_utils/ctx_pool.h | 4 +- .../test/distributed/partition_utils.h | 2 +- ...orttraining_test_hierarchical_ortmodule.py | 2 +- .../orttraining_test_model_transform.py | 2 +- .../python/orttraining_test_ortmodule_api.py | 12 +- ...training_test_ortmodule_bert_classifier.py | 2 +- ...test_ortmodule_bert_classifier_autocast.py | 2 +- ...g_test_ortmodule_deepspeed_zero_stage_1.py | 2 +- .../orttraining_test_ortmodule_onnx_ops.py | 12 +- .../python/orttraining_test_ortmodule_poc.py | 2 +- .../test/python/orttraining_test_utilities.py | 4 +- .../training_ops/function_op_test_utils.cc | 2 +- .../cpu/torch/torch_custom_function_kernel.h | 2 +- .../cuda/cuda_training_kernels.cc | 466 +-- .../rocm/rocm_training_kernels.cc | 374 +- .../tools/scripts/gpt2_model_transform.py | 2 +- orttraining/tools/scripts/model_transform.py | 2 +- pyproject.toml | 1 + requirements-lintrunner.txt | 8 +- tools/ci_build/build.py | 22 +- tools/ci_build/gen_def.py | 10 +- tools/ci_build/reduce_op_kernels.py | 2 +- tools/ci_build/replace_urls_in_deps.py | 6 +- .../upload_python_package_to_azure_storage.py | 2 +- tools/doc/rename_folders.py | 14 +- .../nuget/generate_nuspec_for_native_nuget.py | 6 +- tools/python/onnx_test_data_utils.py | 2 +- .../util/mobile_helpers/usability_checker.py | 2 +- .../util/reduced_build_config_parser.py | 2 +- winml/lib/Api.Image/CpuDetensorizer.h | 9 +- winml/lib/Api.Image/CpuTensorizer.h | 12 +- winml/lib/Api.Image/D3DDeviceCache.cpp | 24 +- winml/lib/Api.Image/EventTimer.h | 4 +- .../lib/Api.Image/ImageConversionHelpers.cpp | 11 +- winml/lib/Api.Image/ImageConverter.cpp | 3 +- .../Api.Image/TensorToVideoFrameConverter.cpp | 13 +- .../Api.Image/VideoFrameToTensorConverter.cpp | 38 +- .../Api.Image/inc/ConverterResourceStore.h | 2 +- winml/lib/Api/FeatureValues.h | 82 +- winml/lib/Api/ImageFeatureValue.cpp | 14 +- winml/lib/Api/LearningModel.cpp | 2 +- winml/lib/Api/LearningModelSession.cpp | 4 +- winml/lib/Api/NumericData.cpp | 12 +- winml/lib/Api/impl/FeatureCompatibility.h | 6 +- winml/lib/Common/CommonDeviceHelpers.cpp | 6 +- ...er_backed_random_access_stream_reference.h | 5 +- 
winml/test/api/raw/winml_microsoft.h | 108 +- winml/test/api/raw/winml_windows.h | 112 +- winml/test/image/imagetests.cpp | 21 +- winml/test/model/compare_feature_value.cpp | 3 +- winml/test/model/model_tests.cpp | 10 +- winml/test/model/skip_model_tests.h | 6 +- winml/test/scenario/cppwinrt/NoisyReluCpu.h | 6 +- winml/test/scenario/cppwinrt/ReluCpu.h | 6 +- 152 files changed, 3781 insertions(+), 3842 deletions(-) diff --git a/cgmanifests/generate_cgmanifest.py b/cgmanifests/generate_cgmanifest.py index 3cecbb0cc977f..52bd3f58645f2 100644 --- a/cgmanifests/generate_cgmanifest.py +++ b/cgmanifests/generate_cgmanifest.py @@ -73,7 +73,7 @@ def add_github_dep(name, parsed_url): return # Make a REST call to convert to tag to a git commit url = f"https://api.github.com/repos/{org_name}/{repo_name}/git/refs/tags/{tag}" - print("requesting %s ..." % url) + print("requesting {url} ...") res = requests.get(url, auth=(args.username, args.token)) response_json = res.json() tag_object = response_json["object"] diff --git a/csharp/tools/ValidateNativeDelegateAttributes.py b/csharp/tools/ValidateNativeDelegateAttributes.py index acd6c173bfeb0..7431cc8d9d288 100644 --- a/csharp/tools/ValidateNativeDelegateAttributes.py +++ b/csharp/tools/ValidateNativeDelegateAttributes.py @@ -19,7 +19,7 @@ def check_all_delegates_have_unmanaged_function_pointer_attribute(file: pathlib. line_num = 0 with open(str(file.resolve(strict=True))) as f: prev_line = "" - for line in f.readlines(): + for line in f: line_num += 1 # strip so it's easier to deal with commented out lines. diff --git a/include/onnxruntime/core/common/exceptions.h b/include/onnxruntime/core/common/exceptions.h index 18c117f12ad7d..494a770b8db98 100644 --- a/include/onnxruntime/core/common/exceptions.h +++ b/include/onnxruntime/core/common/exceptions.h @@ -17,13 +17,13 @@ namespace onnxruntime { class NotImplementedException : public std::logic_error { public: - explicit NotImplementedException(const char* _Message = "Function not yet implemented") noexcept : std::logic_error(_Message){}; - explicit NotImplementedException(const std::string& _Message = "Function not yet implemented") noexcept : std::logic_error(_Message){}; + explicit NotImplementedException(const char* _Message = "Function not yet implemented") noexcept : std::logic_error(_Message) {}; + explicit NotImplementedException(const std::string& _Message = "Function not yet implemented") noexcept : std::logic_error(_Message) {}; }; class TypeMismatchException : public std::logic_error { public: - TypeMismatchException() noexcept : logic_error("Type mismatch"){}; + TypeMismatchException() noexcept : logic_error("Type mismatch") {}; }; class OnnxRuntimeException : public std::exception { diff --git a/include/onnxruntime/core/framework/stream_handles.h b/include/onnxruntime/core/framework/stream_handles.h index 9c987f10ccadb..01631e1fb2aa6 100644 --- a/include/onnxruntime/core/framework/stream_handles.h +++ b/include/onnxruntime/core/framework/stream_handles.h @@ -32,7 +32,7 @@ class Stream { return {}; }; // block the host thread until all the tasks in the stream finished. - virtual void Flush(){}; + virtual void Flush() {}; // The framework may reuse the stream instance for multiple iterations. // This is the API that provide a chance to let the device stream cleanup // resource at the end of a iteration. 
diff --git a/include/onnxruntime/core/platform/Barrier.h b/include/onnxruntime/core/platform/Barrier.h index 915cfc50953ed..1148b052bd9af 100644 --- a/include/onnxruntime/core/platform/Barrier.h +++ b/include/onnxruntime/core/platform/Barrier.h @@ -76,6 +76,6 @@ class Barrier { // Multiple threads can wait on the same Notification object, // but only one caller must call Notify() on the object. struct Notification : Barrier { - Notification() : Barrier(1){}; + Notification() : Barrier(1) {}; }; } // namespace onnxruntime diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h index e33007102e198..d4411a6d72356 100644 --- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h +++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h @@ -219,18 +219,18 @@ class ThreadPoolProfiler { WAIT_REVOKE, MAX_EVENT }; - ThreadPoolProfiler(int, const CHAR_TYPE*){}; + ThreadPoolProfiler(int, const CHAR_TYPE*) {}; ~ThreadPoolProfiler() = default; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(ThreadPoolProfiler); - void Start(){}; + void Start() {}; std::string Stop() { return "not available for minimal build"; } - void LogStart(){}; + void LogStart() {}; void LogEnd(ThreadPoolEvent){}; void LogEndAndStart(ThreadPoolEvent){}; void LogStartAndCoreAndBlock(std::ptrdiff_t){}; void LogCoreAndBlock(std::ptrdiff_t){}; - void LogThreadId(int){}; - void LogRun(int){}; + void LogThreadId(int) {}; + void LogRun(int) {}; std::string DumpChildThreadStat() { return {}; } }; #else diff --git a/include/onnxruntime/core/providers/custom_op_context.h b/include/onnxruntime/core/providers/custom_op_context.h index 8f3d2476d4fdb..b10126da8e0fb 100644 --- a/include/onnxruntime/core/providers/custom_op_context.h +++ b/include/onnxruntime/core/providers/custom_op_context.h @@ -6,5 +6,5 @@ // CustomOpContext defines an interface allowing a custom op to access ep-specific resources. 
struct CustomOpContext { CustomOpContext() = default; - virtual ~CustomOpContext(){}; + virtual ~CustomOpContext() {}; }; \ No newline at end of file diff --git a/include/onnxruntime/core/session/experimental_onnxruntime_cxx_api.h b/include/onnxruntime/core/session/experimental_onnxruntime_cxx_api.h index 9e4ceffc44bfd..c1a7839ff22fa 100644 --- a/include/onnxruntime/core/session/experimental_onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/experimental_onnxruntime_cxx_api.h @@ -24,9 +24,9 @@ namespace Experimental { struct Session : Ort::Session { Session(Env& env, std::basic_string& model_path, SessionOptions& options) - : Ort::Session(env, model_path.data(), options){}; + : Ort::Session(env, model_path.data(), options) {}; Session(Env& env, void* model_data, size_t model_data_length, SessionOptions& options) - : Ort::Session(env, model_data, model_data_length, options){}; + : Ort::Session(env, model_data, model_data_length, options) {}; // overloaded Run() with sensible defaults std::vector Run(const std::vector& input_names, @@ -52,7 +52,7 @@ struct Session : Ort::Session { struct Value : Ort::Value { Value(OrtValue* p) - : Ort::Value(p){}; + : Ort::Value(p) {}; template static Ort::Value CreateTensor(T* p_data, size_t p_data_element_count, const std::vector& shape); diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 8091fd4cfc2a3..5d974e1ff5185 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -2175,8 +2175,8 @@ struct Op : detail::Base { /// struct ShapeInferContext { struct SymbolicInteger { - SymbolicInteger(int64_t i) : i_(i), is_int_(true){}; - SymbolicInteger(const char* s) : s_(s), is_int_(false){}; + SymbolicInteger(int64_t i) : i_(i), is_int_(true) {}; + SymbolicInteger(const char* s) : s_(s), is_int_(false) {}; SymbolicInteger(const SymbolicInteger&) = default; SymbolicInteger(SymbolicInteger&&) = default; diff --git a/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h b/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h index 57a64380faeb0..ce87d8c56d3fe 100644 --- a/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h +++ b/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h @@ -29,7 +29,7 @@ class ArgBase { ArgBase(OrtKernelContext* ctx, size_t indice, bool is_input) : ctx_(ctx), indice_(indice), is_input_(is_input) {} - virtual ~ArgBase(){}; + virtual ~ArgBase() {}; protected: struct KernelContext ctx_; diff --git a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc index 90a51fda0b188..84f9ca88ecf55 100644 --- a/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cpu/cpu_contrib_kernels.cc @@ -267,83 +267,83 @@ Status RegisterQuantizationKernels(KernelRegistry& kernel_registry) { Status RegisterCpuContribKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { - BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing - BuildKernelCreateInfo, + BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + BuildKernelCreateInfo, - // add more kernels here - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, 
- BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + // add more kernels here + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_SPARSE_TENSORS) - BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifndef ORT_MINIMAL_BUILD - BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // These ops were experimental ops in onnx domain which have been removed now. We add them here as - // contrib ops to main backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // These ops were experimental ops in onnx domain which have been removed now. We add them here as + // contrib ops to main backward compatibility + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_ATEN - BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #ifdef ENABLE_TRAINING_OPS - // Should remove the shrunken_gather include from ENABLE_TRAINING_OPS once 1). compute optimizer is enabled for inference or - // 2). this is needed by inference for other purpose. - BuildKernelCreateInfo, + // Should remove the shrunken_gather include from ENABLE_TRAINING_OPS once 1). 
compute optimizer is enabled for inference or + // 2). this is needed by inference for other purpose. + BuildKernelCreateInfo, #endif }; diff --git a/onnxruntime/contrib_ops/cpu/crop_and_resize.cc b/onnxruntime/contrib_ops/cpu/crop_and_resize.cc index 1863522c1643c..533d62f5e7486 100644 --- a/onnxruntime/contrib_ops/cpu/crop_and_resize.cc +++ b/onnxruntime/contrib_ops/cpu/crop_and_resize.cc @@ -173,7 +173,7 @@ void CropAndResizeForward(const TensorShape& output_shape, } } } // for pw - } // for ph + } // for ph }, 0); // for n } diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/alibi.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/alibi.h index 5d94190ecbeb9..18d36cfd88d60 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/alibi.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/alibi.h @@ -17,7 +17,7 @@ struct Alibi { const int max_seqlen_k, max_seqlen_q; __forceinline__ __device__ Alibi(const float alibi_slope, const int max_seqlen_k, const int max_seqlen_q) - : alibi_slope(alibi_slope), max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q){}; + : alibi_slope(alibi_slope), max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q) {}; template __forceinline__ __device__ void apply_alibi(Tensor& tensor, diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h index b225e5e3be559..0998155eba635 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h @@ -116,7 +116,7 @@ struct Mask { __forceinline__ __device__ Mask(const int max_seqlen_k, const int max_seqlen_q, const int window_size_left, const int window_size_right, const float alibi_slope = 0.f) - : max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q), window_size_left(window_size_left), window_size_right(window_size_right), alibi_slope(!Has_alibi ? 0.0 : alibi_slope){}; + : max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q), window_size_left(window_size_left), window_size_right(window_size_right), alibi_slope(!Has_alibi ? 
0.0 : alibi_slope) {}; // Causal_mask: whether this particular iteration needs causal masking template diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h index 3c205378f0177..ba678b740d376 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h @@ -121,7 +121,7 @@ struct Softmax { using TensorT = decltype(make_tensor(Shape>{})); TensorT row_max, row_sum; - __forceinline__ __device__ Softmax(){}; + __forceinline__ __device__ Softmax() {}; template __forceinline__ __device__ void softmax_rescale_o(Tensor0& acc_s, Tensor1& acc_o, float softmax_scale_log2) { diff --git a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc index b237e5c24bbef..21bd5eb91c20f 100644 --- a/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/cuda/cuda_contrib_kernels.cc @@ -231,206 +231,206 @@ KernelCreateInfo BuildKernelCreateInfo() { Status RegisterCudaContribKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { - BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // These ops were experimental ops in onnx domain which have been removed now. 
We add them here as - // contrib ops to maintain backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // TransposedMatMul is still here for backward compatibility - BuildKernelCreateInfo, // backward compatibility - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // These ops were experimental ops in onnx domain which have been removed now. We add them here as + // contrib ops to maintain backward compatibility + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // TransposedMatMul is still here for backward compatibility + BuildKernelCreateInfo, // backward compatibility + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_ATEN - BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #ifdef ENABLE_TRAINING_OPS - // Should remove the shrunken_gather include from ENABLE_TRAINING_OPS once - // 1). compute optimizer is enabled for inference or - // 2). 
this is needed by inference for other purpose. - BuildKernelCreateInfo, + // Should remove the shrunken_gather include from ENABLE_TRAINING_OPS once + // 1). compute optimizer is enabled for inference or + // 2). this is needed by inference for other purpose. + BuildKernelCreateInfo, #endif #if defined(ORT_USE_NCCL) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif }; diff --git a/onnxruntime/core/framework/ex_lib_loader.h b/onnxruntime/core/framework/ex_lib_loader.h index cc353a7521786..d7ea5db3e5a26 100644 --- a/onnxruntime/core/framework/ex_lib_loader.h +++ b/onnxruntime/core/framework/ex_lib_loader.h @@ -20,7 +20,7 @@ class ExLibLoader { virtual ~ExLibLoader(); protected: - virtual void PreUnloadLibrary(void* /*handle*/){}; + virtual void PreUnloadLibrary(void* /*handle*/) {}; std::map dso_name_data_map_; diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index dea8775c89a30..2d51658953282 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -2665,10 +2665,10 @@ ONNX_MS_OPERATOR_SET_SCHEMA(CropAndResize, 1, #if !defined(DISABLE_FLOAT8_TYPES) #define GEMM_FLOAT8_TYPES \ - { "tensor(float8e4m3fn)", "tensor(float8e5m2)", "tensor(float16)", "tensor(bfloat16)", "tensor(float)" } + {"tensor(float8e4m3fn)", "tensor(float8e5m2)", "tensor(float16)", "tensor(bfloat16)", "tensor(float)"} #else #define GEMM_FLOAT8_TYPES \ - { "tensor(float16)", "tensor(bfloat16)", "tensor(float)" } + {"tensor(float16)", "tensor(bfloat16)", "tensor(float)"} #endif ONNX_MS_OPERATOR_SET_SCHEMA(GemmFloat8, 1, diff --git a/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h b/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h index c042bb0059ac2..e7d2d32809fc5 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h +++ b/onnxruntime/core/optimizer/transpose_optimization/optimizer_api.h @@ -86,7 +86,7 @@ class TensorRef { /// Flattened tensor data in bytes virtual std::vector Data() const = 0; - virtual ~TensorRef(){}; + virtual ~TensorRef() {}; }; /// @@ -131,7 +131,7 @@ class ValueInfoRef { /// Indices of dimensions to add. 
Indices are relative to final shape. virtual void UnsqueezeDims(const std::vector& axes) = 0; - virtual ~ValueInfoRef(){}; + virtual ~ValueInfoRef() {}; }; /// @@ -248,7 +248,7 @@ class NodeRef { /// Id virtual int64_t Id() const = 0; - virtual ~NodeRef(){}; + virtual ~NodeRef() {}; }; /// @@ -449,7 +449,7 @@ class GraphRef { /// True if output of the Graph. virtual bool IsGraphOutput(std::string_view name) const = 0; - virtual ~GraphRef(){}; + virtual ~GraphRef() {}; }; } // namespace api diff --git a/onnxruntime/core/platform/path_lib.h b/onnxruntime/core/platform/path_lib.h index fca8990f14821..94425a3999d42 100644 --- a/onnxruntime/core/platform/path_lib.h +++ b/onnxruntime/core/platform/path_lib.h @@ -228,11 +228,9 @@ inline std::basic_string GetLastComponent(const std::basic_strin typename std::basic_string::size_type pos = input.length(); PATH_CHAR_TYPE sep = GetPathSep(); // remove trailing backslash - for (; pos > 1 && input[pos - 1] == sep; --pos) - ; + for (; pos > 1 && input[pos - 1] == sep; --pos); input.resize(pos); - for (; pos != 0 && input[pos - 1] != sep; --pos) - ; + for (; pos != 0 && input[pos - 1] != sep; --pos); return input.substr(pos); } diff --git a/onnxruntime/core/providers/coreml/model/model.mm b/onnxruntime/core/providers/coreml/model/model.mm index 4fd822f0d0d15..4d20061820e71 100644 --- a/onnxruntime/core/providers/coreml/model/model.mm +++ b/onnxruntime/core/providers/coreml/model/model.mm @@ -502,7 +502,7 @@ Status GetMLMultiArrayCopyInfo(const MLMultiArray* _Nonnull array, class Execution { public: Execution(const std::string& path, const logging::Logger& logger, uint32_t coreml_flags); - ~Execution(){}; + ~Execution() {}; Status LoadModel(); Status Predict(const std::unordered_map& inputs, diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index 9147107ac518a..7ac68e3a9a69d 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -1133,1568 +1133,1568 @@ KernelCreateInfo BuildKernelCreateInfo() { Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { - BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 9 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 10 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // opset 11 - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // OpSet 12 - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // REVIEW(codemzs): ConstEigenVectorArrayMap.cast, - // BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // opset 13 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // OpSet 14 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 15 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 9 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 10 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // opset 11 + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, 
+ BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // OpSet 12 + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // REVIEW(codemzs): ConstEigenVectorArrayMap.cast, + // BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // opset 13 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // OpSet 14 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 15 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_OPTIONAL_TYPE) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - // Opset 16 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 17 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 18 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + // Opset 16 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 17 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 18 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_OPTIONAL_TYPE) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - // Opset 19 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + 
// Opset 19 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 20 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 20 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Opset 21 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Opset 21 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif }; diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc index 0c45b315f0280..758066d8a84e0 100644 --- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc +++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_classifier.cc @@ -41,16 +41,16 @@ TreeEnsembleClassifier::TreeEnsembleClassifier(const OpKernelInfo& info) : Op template Status TreeEnsembleClassifier::GetRemovableAttributes(InlinedVector& removable_attributes) const { - InlinedVector names { - "base_values", "nodes_falsenodeids", "nodes_featureids", "nodes_hitrates", - "nodes_missing_value_tracks_true", "nodes_modes", "nodes_nodeids", "nodes_treeids", - "nodes_truenodeids", "nodes_values", "class_ids", "class_treeids", "class_nodeids", - "class_weights", "classlabels_strings", - "classlabels_int64s" + InlinedVector names{ + "base_values", "nodes_falsenodeids", "nodes_featureids", "nodes_hitrates", + "nodes_missing_value_tracks_true", "nodes_modes", "nodes_nodeids", "nodes_treeids", + "nodes_truenodeids", "nodes_values", "class_ids", "class_treeids", "class_nodeids", + "class_weights", "classlabels_strings", + "classlabels_int64s" #if !defined(ORT_MINIMAL_BUILD) - "base_values_as_tensor", - "nodes_hitrates_as_tensor", "nodes_values_as_tensor", - "class_weights_as_tensor" + "base_values_as_tensor", + "nodes_hitrates_as_tensor", "nodes_values_as_tensor", + "class_weights_as_tensor" #endif }; removable_attributes.swap(names); diff --git a/onnxruntime/core/providers/cpu/ml/treeregressor.cc b/onnxruntime/core/providers/cpu/ml/treeregressor.cc index 17f5cf32960da..6b5b972d3c929 100644 --- a/onnxruntime/core/providers/cpu/ml/treeregressor.cc +++ b/onnxruntime/core/providers/cpu/ml/treeregressor.cc @@ -48,16 +48,16 @@ TreeEnsembleRegressor::TreeEnsembleRegressor(const OpKernelInfo& info) : OpKe template Status 
TreeEnsembleRegressor::GetRemovableAttributes(InlinedVector& removable_attributes) const { - InlinedVector names { - "base_values", "nodes_falsenodeids", "nodes_featureids", "nodes_hitrates", - "nodes_missing_value_tracks_true", "nodes_modes", "nodes_nodeids", "nodes_treeids", - "nodes_truenodeids", "nodes_values", - "target_ids", "target_treeids", "target_nodeids", - "target_weights" + InlinedVector names{ + "base_values", "nodes_falsenodeids", "nodes_featureids", "nodes_hitrates", + "nodes_missing_value_tracks_true", "nodes_modes", "nodes_nodeids", "nodes_treeids", + "nodes_truenodeids", "nodes_values", + "target_ids", "target_treeids", "target_nodeids", + "target_weights" #if !defined(ORT_MINIMAL_BUILD) - "base_values_as_tensor", - "nodes_hitrates_as_tensor", "nodes_values_as_tensor", - "class_weights_as_tensor" + "base_values_as_tensor", + "nodes_hitrates_as_tensor", "nodes_values_as_tensor", + "class_weights_as_tensor" #endif }; removable_attributes.swap(names); diff --git a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc index 4a176b0726a18..721c2064fae03 100644 --- a/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc +++ b/onnxruntime/core/providers/cpu/object_detection/non_max_suppression.cc @@ -195,8 +195,8 @@ Status NonMaxSuppression::Compute(OpKernelContext* ctx) const { } sorted_boxes.pop(); } // while - } // for class_index - } // for batch_index + } // for class_index + } // for batch_index constexpr auto last_dim = 3; const auto num_selected = selected_indices.size(); diff --git a/onnxruntime/core/providers/cpu/object_detection/roialign.cc b/onnxruntime/core/providers/cpu/object_detection/roialign.cc index ead2ccaef002e..d8c81e5cb63e5 100644 --- a/onnxruntime/core/providers/cpu/object_detection/roialign.cc +++ b/onnxruntime/core/providers/cpu/object_detection/roialign.cc @@ -251,9 +251,9 @@ void RoiAlignForward(const TensorShape& output_shape, const T* bottom_data, floa top_data[index] = output_val; } // for pw - } // for ph - } // for c - } // for n + } // for ph + } // for c + } // for n }); } } // namespace diff --git a/onnxruntime/core/providers/cpu/tensor/expand.cc b/onnxruntime/core/providers/cpu/tensor/expand.cc index 6ead2ea73460b..b0c636281bc7a 100644 --- a/onnxruntime/core/providers/cpu/tensor/expand.cc +++ b/onnxruntime/core/providers/cpu/tensor/expand.cc @@ -128,7 +128,7 @@ Status Expand::Compute(OpKernelContext* context) const { memcpy(output_data + output_offset, input_data + input_offset, onnxruntime::narrow(copy_byte)); output_offsets[onnxruntime::narrow(i)] = output_offset; } // for i - }; // distribute_fn + }; // distribute_fn auto per_thread_tasks = distribute_count / concurrency::ThreadPool::DegreeOfParallelism(context->GetOperatorThreadPool()); @@ -169,9 +169,9 @@ Status Expand::Compute(OpKernelContext* context) const { copy_byte >>= 1; } } // while - } // if - } // for - }; // copy_fn + } // if + } // for + }; // copy_fn if (per_thread_tasks > 20) { concurrency::ThreadPool::TryParallelFor( context->GetOperatorThreadPool(), @@ -181,7 +181,7 @@ Status Expand::Compute(OpKernelContext* context) const { } else { copy_fn(0, onnxruntime::narrow(distribute_count)); } // else - } // for + } // for return Status::OK(); } // Expand::compute diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index 8c03e489d298d..5771380433b35 100644 --- 
a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -1394,916 +1394,916 @@ KernelCreateInfo BuildKernelCreateInfo() {
 static Status RegisterCudaKernels(KernelRegistry& kernel_registry) {
   static const BuildKernelCreateInfoFn function_table[] = {
[... bulk of this 916-line hunk: the CUDA execution provider's BuildKernelCreateInfo<...> registration entries (the default entry plus the base-opset kernels and the sections commented "// opset 10" through "// Opset 19", most of them inside the #ifndef USE_CUDA_MINIMAL block), removed and re-added in place as a formatting-only change; the template arguments are not shown here. The float8-guarded Opset 19 entries and the Opset 20 section follow below. ...]
#if
!defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #if !defined(DISABLE_FLOAT8_TYPES) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, - // Opset 20 - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + // Opset 20 + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif }; diff --git a/onnxruntime/core/providers/cuda/cuda_graph.h b/onnxruntime/core/providers/cuda/cuda_graph.h index 064994c1f14ae..dd03db94b631c 100644 --- a/onnxruntime/core/providers/cuda/cuda_graph.h +++ b/onnxruntime/core/providers/cuda/cuda_graph.h @@ -18,7 +18,7 @@ constexpr CudaGraphAnnotation_t kCudaGraphAnnotationSkip = -1; constexpr CudaGraphAnnotation_t kCudaGraphAnnotationDefault = 0; struct CudaGraphSet { - CudaGraphSet(){}; + CudaGraphSet() {}; ~CudaGraphSet(); void Clear(); @@ -31,7 +31,7 @@ struct CudaGraphSet { }; struct CUDAGraphManager { - CUDAGraphManager(){}; + CUDAGraphManager() {}; CUDAGraphManager(cudaStream_t stream); ~CUDAGraphManager(); diff --git a/onnxruntime/core/providers/cuda/cuda_profiler.h b/onnxruntime/core/providers/cuda/cuda_profiler.h index 88c9adc5e17b3..4930e55351615 100644 --- a/onnxruntime/core/providers/cuda/cuda_profiler.h +++ b/onnxruntime/core/providers/cuda/cuda_profiler.h @@ -33,7 +33,7 @@ class CudaProfiler final : public EpProfiler { ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CudaProfiler); ~CudaProfiler() {} bool StartProfiling(TimePoint) override { return true; } - void EndProfiling(TimePoint, Events&) override{}; + void EndProfiling(TimePoint, Events&) override {}; void Start(uint64_t) override{}; void Stop(uint64_t) override{}; }; diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.h b/onnxruntime/core/providers/cuda/nn/conv_transpose.h index 77c9d94162b6b..71ad3ee6e2147 100644 --- a/onnxruntime/core/providers/cuda/nn/conv_transpose.h +++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.h @@ -18,7 +18,7 @@ namespace cuda { template class ConvTranspose : public CudaKernel { public: - ConvTranspose(const OpKernelInfo& info) : CudaKernel(info), conv_transpose_attrs_(info){}; + ConvTranspose(const OpKernelInfo& info) : CudaKernel(info), conv_transpose_attrs_(info) {}; Status 
PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, [[maybe_unused]] PrePackedWeights* prepacked_weights) override; Status ComputeInternal(OpKernelContext* context) const override; diff --git a/onnxruntime/core/providers/cuda/nvtx_profile.h b/onnxruntime/core/providers/cuda/nvtx_profile.h index f98745cbfc5c2..e545578a72fc4 100644 --- a/onnxruntime/core/providers/cuda/nvtx_profile.h +++ b/onnxruntime/core/providers/cuda/nvtx_profile.h @@ -45,7 +45,7 @@ enum class Color : uint32_t { class RangeCreatorBase { public: RangeCreatorBase(const std::string message, const Color color) - : message_(message), color_(color), is_begin_called_(false), is_end_called_(false){}; + : message_(message), color_(color), is_begin_called_(false), is_end_called_(false) {}; // Check if Begin and End are both called. // It's pointless if not all of them are called. @@ -100,7 +100,7 @@ class RangeCreatorBase { class NvtxRangeCreator final : public RangeCreatorBase { public: NvtxRangeCreator(const std::string message, const Color color) - : RangeCreatorBase(message, color){}; + : RangeCreatorBase(message, color) {}; void BeginImpl() override; void EndImpl() override; @@ -114,7 +114,7 @@ class NvtxRangeCreator final : public RangeCreatorBase { class NvtxNestedRangeCreator final : public RangeCreatorBase { public: NvtxNestedRangeCreator(const std::string message, const Color color) - : RangeCreatorBase(message, color){}; + : RangeCreatorBase(message, color) {}; void BeginImpl() override; void EndImpl() override; @@ -123,7 +123,7 @@ class NvtxNestedRangeCreator final : public RangeCreatorBase { class NvtxMarkerCreator final { public: NvtxMarkerCreator(const std::string message, const Color color) - : message_(message), color_(color){}; + : message_(message), color_(color) {}; void Mark(); private: diff --git a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h index 1f7df9b6fc2e3..ed642754af3ba 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cuda_utils.h @@ -35,7 +35,7 @@ enum class BroadcastIndexType : int32_t { template class IConstantBuffer { public: - virtual ~IConstantBuffer(){}; + virtual ~IConstantBuffer() {}; virtual const T* GetBuffer(cudaStream_t stream, size_t count) = 0; }; diff --git a/onnxruntime/core/providers/cuda/tensor/cast_op.cc b/onnxruntime/core/providers/cuda/tensor/cast_op.cc index 8e5a68e2a278e..821695bbbd42f 100644 --- a/onnxruntime/core/providers/cuda/tensor/cast_op.cc +++ b/onnxruntime/core/providers/cuda/tensor/cast_op.cc @@ -13,23 +13,23 @@ const std::vector& CastOpTypeConstraints() { // Must be done as a local static for a shared provider, to avoid the prefast warning: // Global initializer calls a non-constexpr function 'onnxruntime::DataTypeImpl::GetTensorType' // In a shared provider, GetTensorType is a function call into Onnxruntime and isn't constexpr - static std::vector types { - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType() + static std::vector types{ + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + 
DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType() #if !defined(DISABLE_FLOAT8_TYPES) - , - DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType() + , + DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType() #endif }; return types; diff --git a/onnxruntime/core/providers/dnnl/dnnl_node_capability.h b/onnxruntime/core/providers/dnnl/dnnl_node_capability.h index 3ed3705f6d81b..f67b70616547c 100644 --- a/onnxruntime/core/providers/dnnl/dnnl_node_capability.h +++ b/onnxruntime/core/providers/dnnl/dnnl_node_capability.h @@ -42,7 +42,7 @@ enum ORT_DataType : int { */ class DnnlNodeCapability { public: - virtual ~DnnlNodeCapability(){}; + virtual ~DnnlNodeCapability() {}; /** * virtual function expected to be implemented for different node * types. diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.h index ceac2a6f58b32..add9f440df91f 100644 --- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.h +++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph.h @@ -18,7 +18,7 @@ class DnnlNode; class DnnlNodeArg { public: DnnlNodeArg(DnnlNode* node, size_t index, bool is_output) - : node_(node), index_(index), is_output_(is_output){}; + : node_(node), index_(index), is_output_(is_output) {}; DnnlNodeArg() = default; DnnlNode* GetNode() { return node_; }; size_t GetIndex() { return index_; }; diff --git a/onnxruntime/core/providers/js/allocator.h b/onnxruntime/core/providers/js/allocator.h index 6aa8313c01f38..267015b2ea58d 100644 --- a/onnxruntime/core/providers/js/allocator.h +++ b/onnxruntime/core/providers/js/allocator.h @@ -15,7 +15,7 @@ class JsCPUAllocator : public CPUAllocator { : CPUAllocator( OrtMemoryInfo("JsCPUAllocator", OrtAllocatorType::OrtDeviceAllocator, OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0), - 0, OrtMemTypeCPU)){}; + 0, OrtMemTypeCPU)) {}; }; class JsCustomAllocator : public IAllocator { diff --git a/onnxruntime/core/providers/js/data_transfer.h b/onnxruntime/core/providers/js/data_transfer.h index 3dfb19cfde5ac..6a0e8586776a2 100644 --- a/onnxruntime/core/providers/js/data_transfer.h +++ b/onnxruntime/core/providers/js/data_transfer.h @@ -11,8 +11,8 @@ namespace js { class DataTransfer : public IDataTransfer { public: - DataTransfer(){}; - ~DataTransfer(){}; + DataTransfer() {}; + ~DataTransfer() {}; bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc index 9e31cf9cae21a..d0f6ce9effd9e 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc @@ -125,7 +125,7 @@ Status ExpandOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, default: return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported."); } // switch - } // if-else + } // if-else const std::string& output_name = node_unit.Outputs()[0].node_arg.Name(); std::string shape_input_name(input_name + "_" + output_name); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc 
b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc index b7455314578de..5fc6d42a8a179 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -163,7 +163,7 @@ Status ProcessConstantValue(QnnModelWrapper& qnn_model_wrapper, default: return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Type not supported."); } // switch - } // if-else + } // if-else QnnParamWrapper constant_value_param(node_unit.Index(), node_unit.Name(), diff --git a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc index da2d517f65697..5fc4fb3db4122 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_quant_params_wrapper.cc @@ -10,7 +10,7 @@ #include "core/providers/qnn/builder/qnn_model_wrapper.h" #define ALIGN_PTR_UP(ptr, align, type) \ - reinterpret_cast((reinterpret_cast(ptr) + (align)-1) & ~((align)-1)) + reinterpret_cast((reinterpret_cast(ptr) + (align) - 1) & ~((align) - 1)) namespace onnxruntime { namespace qnn { diff --git a/onnxruntime/core/providers/rocm/rocm_profiler.h b/onnxruntime/core/providers/rocm/rocm_profiler.h index 070cca570f481..d5c7e3f273565 100644 --- a/onnxruntime/core/providers/rocm/rocm_profiler.h +++ b/onnxruntime/core/providers/rocm/rocm_profiler.h @@ -34,7 +34,7 @@ class RocmProfiler final : public EpProfiler { ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(RocmProfiler); ~RocmProfiler() {} bool StartProfiling(TimePoint) override { return true; } - void EndProfiling(TimePoint, Events&) override{}; + void EndProfiling(TimePoint, Events&) override {}; void Start(uint64_t) override{}; void Stop(uint64_t) override{}; }; diff --git a/onnxruntime/core/providers/shared_library/provider_host_api.h b/onnxruntime/core/providers/shared_library/provider_host_api.h index 43d661344d787..e25426b5124dd 100644 --- a/onnxruntime/core/providers/shared_library/provider_host_api.h +++ b/onnxruntime/core/providers/shared_library/provider_host_api.h @@ -24,10 +24,10 @@ struct Provider { virtual ProviderOptions GetProviderOptions(const void* /*provider options struct*/) { return {}; } // Update provider options from key-value string configuration - virtual void UpdateProviderOptions(void* /*provider options to be configured*/, const ProviderOptions& /*key-value string provider options*/){}; + virtual void UpdateProviderOptions(void* /*provider options to be configured*/, const ProviderOptions& /*key-value string provider options*/) {}; // Get provider specific custom op domain list. Provider has the resposibility to release OrtCustomOpDomain instances it creates. 
- virtual void GetCustomOpDomainList(IExecutionProviderFactory* /*pointer to factory instance*/, std::vector& /*provider custom op domain list*/){}; + virtual void GetCustomOpDomainList(IExecutionProviderFactory* /*pointer to factory instance*/, std::vector& /*provider custom op domain list*/) {}; virtual void Initialize() = 0; // Called right after loading the shared library, if this throws any errors Shutdown() will be called and the library unloaded virtual void Shutdown() = 0; // Called right before unloading the shared library diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h index 54212d34aa2ce..a72de6ed75399 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h @@ -24,8 +24,8 @@ struct TensorRTCustomKernel { : compute_stream_(compute_stream) { } - void Compute(OrtKernelContext* /*context*/){ - // The implementation is in TensorRT plugin. No need to implement it here. + void Compute(OrtKernelContext* /*context*/) { + // The implementation is in TensorRT plugin. No need to implement it here. }; private: diff --git a/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc b/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc index dc34419ef936f..453db30e1320f 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc @@ -46,7 +46,7 @@ struct VitisAI_Provider : Provider { } }; // Get provider specific custom op domain list. Provider has the resposibility to release OrtCustomOpDomain instances it creates. - void GetCustomOpDomainList(IExecutionProviderFactory*, std::vector&) override{}; + void GetCustomOpDomainList(IExecutionProviderFactory*, std::vector&) override {}; // Called right after loading the shared library, if this throws any errors Shutdown() will be called and the library unloaded void Initialize() override { initialize_vitisai_ep(); } // Called right before unloading the shared library diff --git a/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h b/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h index df2e429f58b2f..4c10ba01b1c2e 100644 --- a/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h +++ b/onnxruntime/core/providers/vsinpu/builders/impl/elementwise_op_builder.h @@ -47,7 +47,7 @@ namespace npu { std::vector>& outputs, \ const NodeUnit& node_unit) override { \ LOGS_DEFAULT(INFO) << "Creating " << #onnx_op_type << " Op"; \ - auto op = graph_ep->GetGraph() -> CreateOperation(); \ + auto op = graph_ep->GetGraph()->CreateOperation(); \ (*op).BindInputs(inputs).BindOutputs(outputs); \ return true; \ ; \ diff --git a/onnxruntime/core/providers/vsinpu/builders/op_builder_factory.h b/onnxruntime/core/providers/vsinpu/builders/op_builder_factory.h index 27c148c1672c5..dc0969429b8ff 100644 --- a/onnxruntime/core/providers/vsinpu/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/vsinpu/builders/op_builder_factory.h @@ -60,10 +60,9 @@ using createIOpBuildItemFunc = std::function()>; using OpBuildItemType = std::map>; static const std::map reg = { -#define REGISTER_OP_BUILDER(ONNX_NODE_TYPE, BUILDER_TYPE) \ - { \ - ONNX_NODE_TYPE, [] { return std::make_unique(); } \ - } +#define REGISTER_OP_BUILDER(ONNX_NODE_TYPE, BUILDER_TYPE) \ + { \ + ONNX_NODE_TYPE, [] { 
return std::make_unique(); }} REGISTER_OP_BUILDER("Add", AddOpBuilder), REGISTER_OP_BUILDER("Sub", SubOpBuilder), diff --git a/onnxruntime/python/onnxruntime_pybind_iobinding.cc b/onnxruntime/python/onnxruntime_pybind_iobinding.cc index 51a52dbfcb3bc..37081cd0ff2b4 100644 --- a/onnxruntime/python/onnxruntime_pybind_iobinding.cc +++ b/onnxruntime/python/onnxruntime_pybind_iobinding.cc @@ -155,11 +155,7 @@ void addIoBindingMethods(pybind11::module& m) { .def("clear_binding_outputs", [](SessionIOBinding* io_binding) -> void { io_binding->Get()->ClearOutputs(); }) - .def( - "get_outputs", [](const SessionIOBinding* io_binding) -> const std::vector& { - return io_binding->Get()->GetOutputs(); - }, - py::return_value_policy::reference_internal) + .def("get_outputs", [](const SessionIOBinding* io_binding) -> const std::vector& { return io_binding->Get()->GetOutputs(); }, py::return_value_policy::reference_internal) .def("copy_outputs_to_cpu", [](const SessionIOBinding* io_binding) -> py::list { const std::vector& outputs = io_binding->Get()->GetOutputs(); @@ -180,8 +176,7 @@ void addIoBindingMethods(pybind11::module& m) { } ++pos; } - return result; - }); + return result; }); } } // namespace python diff --git a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc index 94235b3043bc7..d76b9032afe73 100644 --- a/onnxruntime/python/onnxruntime_pybind_ortvalue.cc +++ b/onnxruntime/python/onnxruntime_pybind_ortvalue.cc @@ -226,7 +226,7 @@ void addOrtValueMethods(pybind11::module& m) { ORT_THROW("Only OrtValues that are Tensors/SparseTensors are currently supported"); #else - ORT_THROW("Only OrtValues that are Tensors are supported in this build"); + ORT_THROW("Only OrtValues that are Tensors are supported in this build"); #endif }) .def("shape", [](const OrtValue* ort_value) -> py::list { @@ -275,26 +275,15 @@ void addOrtValueMethods(pybind11::module& m) { return *ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(*type_proto); }) - .def( - "element_type", [](const OrtValue* ort_value) -> int32_t { - return GetTensorProtoType(*ort_value); - }, - "Returns an integer equal to the ONNX tensor proto type of the tensor or sequence. " - "This integer is one type defined by ONNX TensorProto_DataType " - "(such as onnx.TensorProto.FLOAT)." - "Raises an exception in any other case.") - .def("has_value", [](const OrtValue* ort_value) -> bool { - return ort_value->IsAllocated(); - }) - .def("is_tensor", [](const OrtValue* ort_value) -> bool { - return ort_value->IsTensor(); - }) - .def("is_sparse_tensor", [](const OrtValue* ort_value) -> bool { - return ort_value->IsSparseTensor(); - }) - .def("is_tensor_sequence", [](const OrtValue* ort_value) -> bool { - return ort_value->IsTensorSequence(); - }) + .def("element_type", [](const OrtValue* ort_value) -> int32_t { return GetTensorProtoType(*ort_value); }, + "Returns an integer equal to the ONNX tensor proto type of the tensor or sequence. " + "This integer is one type defined by ONNX TensorProto_DataType " + "(such as onnx.TensorProto.FLOAT)." 
+ "Raises an exception in any other case.") + .def("has_value", [](const OrtValue* ort_value) -> bool { return ort_value->IsAllocated(); }) + .def("is_tensor", [](const OrtValue* ort_value) -> bool { return ort_value->IsTensor(); }) + .def("is_sparse_tensor", [](const OrtValue* ort_value) -> bool { return ort_value->IsSparseTensor(); }) + .def("is_tensor_sequence", [](const OrtValue* ort_value) -> bool { return ort_value->IsTensorSequence(); }) // Converts Tensor into a numpy array .def("numpy", [](const OrtValue* ml_value) -> py::object { ORT_ENFORCE(ml_value->IsTensor(), "Only OrtValues that are Tensors are convertible to Numpy objects"); @@ -310,37 +299,22 @@ void addOrtValueMethods(pybind11::module& m) { #else py::object obj = GetPyObjFromTensor(*ml_value, nullptr, nullptr); #endif - return obj; - }) + return obj; }) #ifdef ENABLE_TRAINING - .def( - "to_dlpack", [](OrtValue* ort_value) -> py::object { - return py::reinterpret_steal(ToDlpack(*ort_value)); - }, - "Returns a DLPack representing the tensor. This method does not copy the pointer shape, " - "instead, it copies the pointer value. The OrtValue must be persist until the dlpack structure " - "is consumed.") - .def_static( - "from_dlpack", [](py::object data, bool is_bool_tensor) { - return FromDlpack(data.ptr(), is_bool_tensor); - }, - py::arg("data"), py::arg("is_bool_tensor") = false, "Converts a tensor from a external library into an OrtValue by means of the __dlpack__ protocol.") - .def( - "__dlpack__", [](OrtValue* ort_value, py::object /* stream */) -> py::object { - return py::reinterpret_steal(ToDlpack(*ort_value)); - }, - py::arg("stream") = py::none(), - "Returns a DLPack representing the tensor (part of __dlpack__ protocol). " - "This method does not copy the pointer shape, instead, it copies the pointer value. " - "The OrtValue must persist until the dlpack structure is consumed.") - .def( - "__dlpack_device__", [](const OrtValue* ort_value) -> py::tuple { + .def("to_dlpack", [](OrtValue* ort_value) -> py::object { return py::reinterpret_steal(ToDlpack(*ort_value)); }, + "Returns a DLPack representing the tensor. This method does not copy the pointer shape, " + "instead, it copies the pointer value. The OrtValue must be persist until the dlpack structure " + "is consumed.") + .def_static("from_dlpack", [](py::object data, bool is_bool_tensor) { return FromDlpack(data.ptr(), is_bool_tensor); }, py::arg("data"), py::arg("is_bool_tensor") = false, "Converts a tensor from a external library into an OrtValue by means of the __dlpack__ protocol.") + .def("__dlpack__", [](OrtValue* ort_value, py::object /* stream */) -> py::object { return py::reinterpret_steal(ToDlpack(*ort_value)); }, py::arg("stream") = py::none(), + "Returns a DLPack representing the tensor (part of __dlpack__ protocol). " + "This method does not copy the pointer shape, instead, it copies the pointer value. 
" + "The OrtValue must persist until the dlpack structure is consumed.") + .def("__dlpack_device__", [](const OrtValue* ort_value) -> py::tuple { ORT_ENFORCE(ort_value->IsTensor(), "Only tensor type OrtValues are supported"); const onnxruntime::Tensor& tensor = ort_value->Get(); DLDevice device = onnxruntime::dlpack::GetDlpackDevice(*ort_value, tensor.Location().device.Id()); - return py::make_tuple(static_cast(device.device_type), device.device_id); - }, - "Returns a tuple of integers, (device, device index) (part of __dlpack__ protocol).") + return py::make_tuple(static_cast(device.device_type), device.device_id); }, "Returns a tuple of integers, (device, device index) (part of __dlpack__ protocol).") #endif ; @@ -350,13 +324,8 @@ void addOrtValueMethods(pybind11::module& m) { v->push_back(ortvalue); }) #ifdef ENABLE_TRAINING - .def( - "push_back", [](std::vector* v, py::object dlpack_tensor, const bool is_bool_tensor) { - v->push_back(FromDlpack(dlpack_tensor.ptr(), is_bool_tensor)); - }, - "Add a new OrtValue after being ownership was transferred from the DLPack structure.", py::arg("dlpack_tensor"), py::arg("is_bool_tensor") = false) - .def( - "push_back_batch", [](std::vector* v, std::vector& torch_tensors, std::vector& data_ptrs, std::vector& element_types, const std::vector>& shapes, const std::vector& devices) { + .def("push_back", [](std::vector* v, py::object dlpack_tensor, const bool is_bool_tensor) { v->push_back(FromDlpack(dlpack_tensor.ptr(), is_bool_tensor)); }, "Add a new OrtValue after being ownership was transferred from the DLPack structure.", py::arg("dlpack_tensor"), py::arg("is_bool_tensor") = false) + .def("push_back_batch", [](std::vector* v, std::vector& torch_tensors, std::vector& data_ptrs, std::vector& element_types, const std::vector>& shapes, const std::vector& devices) { for (size_t i = 0; i < torch_tensors.size(); ++i) { py::object& element_type = element_types.at(i); const std::vector& shape = shapes.at(i); @@ -377,52 +346,36 @@ void addOrtValueMethods(pybind11::module& m) { OrtValue ml_value; Tensor::InitOrtValue(ml_type, gsl::make_span(shape), reinterpret_cast(data_ptr), info, ml_value); v->push_back(ml_value); - } - }, - "Add a batch of OrtValue's by wrapping PyTorch tensors.") + } }, "Add a batch of OrtValue's by wrapping PyTorch tensors.") #endif .def("reserve", [](std::vector* v, const size_t len) { v->reserve(len); }) .def("shrink_to_fit", [](std::vector* v) { v->shrink_to_fit(); }) .def("__len__", [](const std::vector& v) { return v.size(); }) - .def( - "__iter__", [](const std::vector& v) { - return py::make_iterator(v.cbegin(), v.cend()); - }, - py::keep_alive<0, 1>()) - .def("__getitem__", [](const std::vector& v, const size_t idx) { - return v.at(idx); - }) - .def( - "bool_tensor_indices", [](std::vector* v) -> std::vector { + .def("__iter__", [](const std::vector& v) { return py::make_iterator(v.cbegin(), v.cend()); }, py::keep_alive<0, 1>()) + .def("__getitem__", [](const std::vector& v, const size_t idx) { return v.at(idx); }) + .def("bool_tensor_indices", [](std::vector* v) -> std::vector { std::vector indices; for (size_t i = 0; i < v->size(); ++i) { if (GetTensorProtoType((*v)[i]) == ONNX_NAMESPACE::TensorProto_DataType_BOOL) { indices.push_back(static_cast(i)); } } - return indices; - }, - "Returns the indices of every boolean tensor in this vector of OrtValue. " - "In case of a boolean tensor, method to_dlpacks returns a uint8 tensor instead of a boolean tensor. 
" - "If torch consumes the dlpack structure, `.to(torch.bool)` must be applied to the torch tensor " - "to get a boolean tensor.") + return indices; }, + "Returns the indices of every boolean tensor in this vector of OrtValue. " + "In case of a boolean tensor, method to_dlpacks returns a uint8 tensor instead of a boolean tensor. " + "If torch consumes the dlpack structure, `.to(torch.bool)` must be applied to the torch tensor " + "to get a boolean tensor.") #ifdef ENABLE_TRAINING - .def("dlpack_at", [](std::vector* v, const size_t idx) { - return py::reinterpret_steal(ToDlpack(v->at(idx))); - }) + .def("dlpack_at", [](std::vector* v, const size_t idx) { return py::reinterpret_steal(ToDlpack(v->at(idx))); }) #endif - .def( - "element_type_at", [](std::vector* v, const size_t idx) -> int32_t { - return GetTensorProtoType(v->at(idx)); - }, - "Returns an integer equal to the ONNX proto type of the tensor at position i. " - "This integer is one type defined by ONNX TensorProto_DataType " - "(such as onnx.TensorProto.FLOAT)." - "Raises an exception in any other case.", - py::arg("idx")) + .def("element_type_at", [](std::vector* v, const size_t idx) -> int32_t { return GetTensorProtoType(v->at(idx)); }, + "Returns an integer equal to the ONNX proto type of the tensor at position i. " + "This integer is one type defined by ONNX TensorProto_DataType " + "(such as onnx.TensorProto.FLOAT)." + "Raises an exception in any other case.", + py::arg("idx")) #ifdef ENABLE_TRAINING - .def( - "to_dlpacks", [](const std::vector& v, py::object to_tensor) -> py::list { + .def("to_dlpacks", [](const std::vector& v, py::object to_tensor) -> py::list { if (v.size() == 0) return py::list(); @@ -469,9 +422,8 @@ void addOrtValueMethods(pybind11::module& m) { Py_DECREF(capsule); } } - return list_dlpacks; - }, - R"pbdoc(Converts all OrtValue into tensors through DLPack protocol, the method creates + return list_dlpacks; }, + R"pbdoc(Converts all OrtValue into tensors through DLPack protocol, the method creates a DLPack structure for every tensors, then calls python function `to_tensor` to a new object consuming the DLPack structure or return a list of capsule if this function is None. @@ -488,7 +440,7 @@ It creates many tensors acquiring ownership of existing OrtValue. This method saves one object creation and an C++ allocation for every transferred tensor. 
)pbdoc", - py::arg("to_tensor")) + py::arg("to_tensor")) #endif ; diff --git a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc index 7dcead113ac4f..1154f3b9f88b8 100644 --- a/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc +++ b/onnxruntime/python/onnxruntime_pybind_sparse_tensor.cc @@ -397,8 +397,7 @@ void addSparseTensorMethods(pybind11::module& m) { // pybind apparently has a bug with returning enums from def_property_readonly or methods // returning a method object instead of the enumeration value // so we are using def_property and throw on a potential modification - .def_property( - "format", [](const PySparseTensor* py_tensor) -> OrtSparseFormat { + .def_property("format", [](const PySparseTensor* py_tensor) -> OrtSparseFormat { const SparseTensor& tensor = py_tensor->Instance(); auto retval = OrtSparseFormat::ORT_SPARSE_UNDEFINED; switch (tensor.Format()) { diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 6b5daf8cb882b..679ccce7fb07a 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1425,7 +1425,7 @@ void addGlobalMethods(py::module& m) { ORT_UNUSED_PARAMETER(algo); ORT_THROW("set_cudnn_conv_algo_search is not supported in ROCM"); #else - cudnn_conv_algo_search = algo; + cudnn_conv_algo_search = algo; #endif }); // TODO remove deprecated global config @@ -1436,7 +1436,7 @@ void addGlobalMethods(py::module& m) { ORT_UNUSED_PARAMETER(use_single_stream); ORT_THROW("set_do_copy_in_default_stream is not supported in ROCM"); #else - do_copy_in_default_stream = use_single_stream; + do_copy_in_default_stream = use_single_stream; #endif }); // TODO remove deprecated global config @@ -1801,10 +1801,10 @@ Applies to session load, initialization, etc. 
Default is 0.)pbdoc") } ORT_THROW_IF_ERROR(options->value.AddExternalInitializers(names_ptrs, values_ptrs)); #else - ORT_UNUSED_PARAMETER(options); - ORT_UNUSED_PARAMETER(names); - ORT_UNUSED_PARAMETER(ort_values); - ORT_THROW("External initializers are not supported in this build."); + ORT_UNUSED_PARAMETER(options); + ORT_UNUSED_PARAMETER(names); + ORT_UNUSED_PARAMETER(ort_values); + ORT_THROW("External initializers are not supported in this build."); #endif }); @@ -1866,8 +1866,7 @@ including arg name, arg type (contains both type and shape).)pbdoc") return *(na.Type()); }, "node type") - .def( - "__str__", [](const onnxruntime::NodeArg& na) -> std::string { + .def("__str__", [](const onnxruntime::NodeArg& na) -> std::string { std::ostringstream res; res << "NodeArg(name='" << na.Name() << "', type='" << *(na.Type()) << "', shape="; auto shape = na.Shape(); @@ -1893,11 +1892,8 @@ including arg name, arg type (contains both type and shape).)pbdoc") } res << ")"; - return std::string(res.str()); - }, - "converts the node into a readable string") - .def_property_readonly( - "shape", [](const onnxruntime::NodeArg& na) -> std::vector { + return std::string(res.str()); }, "converts the node into a readable string") + .def_property_readonly("shape", [](const onnxruntime::NodeArg& na) -> std::vector { auto shape = na.Shape(); std::vector arr; if (shape == nullptr || shape->dim_size() == 0) { @@ -1914,9 +1910,7 @@ including arg name, arg type (contains both type and shape).)pbdoc") arr[i] = py::none(); } } - return arr; - }, - "node shape (assuming the node holds a tensor)"); + return arr; }, "node shape (assuming the node holds a tensor)"); py::class_ sessionObjectInitializer(m, "SessionObjectInitializer"); py::class_(m, "InferenceSession", R"pbdoc(This is the main class used to run a model.)pbdoc") @@ -2107,51 +2101,28 @@ including arg name, arg type (contains both type and shape).)pbdoc") .def_property_readonly("get_profiling_start_time_ns", [](const PyInferenceSession* sess) -> uint64_t { return sess->GetSessionHandle()->GetProfiling().GetStartTimeNs(); }) - .def( - "get_providers", [](const PyInferenceSession* sess) -> const std::vector& { - return sess->GetSessionHandle()->GetRegisteredProviderTypes(); - }, - py::return_value_policy::reference_internal) - .def( - "get_provider_options", [](const PyInferenceSession* sess) -> const ProviderOptionsMap& { - return sess->GetSessionHandle()->GetAllProviderOptions(); - }, - py::return_value_policy::reference_internal) - .def_property_readonly( - "session_options", [](const PyInferenceSession* sess) -> PySessionOptions* { + .def("get_providers", [](const PyInferenceSession* sess) -> const std::vector& { return sess->GetSessionHandle()->GetRegisteredProviderTypes(); }, py::return_value_policy::reference_internal) + .def("get_provider_options", [](const PyInferenceSession* sess) -> const ProviderOptionsMap& { return sess->GetSessionHandle()->GetAllProviderOptions(); }, py::return_value_policy::reference_internal) + .def_property_readonly("session_options", [](const PyInferenceSession* sess) -> PySessionOptions* { auto session_options = std::make_unique(); session_options->value = sess->GetSessionHandle()->GetSessionOptions(); - return session_options.release(); - }, - py::return_value_policy::take_ownership) - .def_property_readonly( - "inputs_meta", [](const PyInferenceSession* sess) -> const std::vector& { + return session_options.release(); }, py::return_value_policy::take_ownership) + .def_property_readonly("inputs_meta", [](const 
PyInferenceSession* sess) -> const std::vector& { auto res = sess->GetSessionHandle()->GetModelInputs(); OrtPybindThrowIfError(res.first); - return *(res.second); - }, - py::return_value_policy::reference_internal) - .def_property_readonly( - "outputs_meta", [](const PyInferenceSession* sess) -> const std::vector& { + return *(res.second); }, py::return_value_policy::reference_internal) + .def_property_readonly("outputs_meta", [](const PyInferenceSession* sess) -> const std::vector& { auto res = sess->GetSessionHandle()->GetModelOutputs(); OrtPybindThrowIfError(res.first); - return *(res.second); - }, - py::return_value_policy::reference_internal) - .def_property_readonly( - "overridable_initializers", [](const PyInferenceSession* sess) -> const std::vector& { + return *(res.second); }, py::return_value_policy::reference_internal) + .def_property_readonly("overridable_initializers", [](const PyInferenceSession* sess) -> const std::vector& { auto res = sess->GetSessionHandle()->GetOverridableInitializers(); OrtPybindThrowIfError(res.first); - return *(res.second); - }, - py::return_value_policy::reference_internal) - .def_property_readonly( - "model_meta", [](const PyInferenceSession* sess) -> const onnxruntime::ModelMetadata& { + return *(res.second); }, py::return_value_policy::reference_internal) + .def_property_readonly("model_meta", [](const PyInferenceSession* sess) -> const onnxruntime::ModelMetadata& { auto res = sess->GetSessionHandle()->GetModelMetadata(); OrtPybindThrowIfError(res.first); - return *(res.second); - }, - py::return_value_policy::reference_internal) + return *(res.second); }, py::return_value_policy::reference_internal) .def("run_with_iobinding", [](PyInferenceSession* sess, SessionIOBinding& io_binding, RunOptions* run_options = nullptr) -> void { Status status; // release GIL to allow multiple python threads to invoke Run() in parallel. 
@@ -2161,8 +2132,7 @@ including arg name, arg type (contains both type and shape).)pbdoc") else status = sess->GetSessionHandle()->Run(*run_options, *io_binding.Get()); if (!status.IsOK()) - throw std::runtime_error("Error in execution: " + status.ErrorMessage()); - }) + throw std::runtime_error("Error in execution: " + status.ErrorMessage()); }) .def("get_tuning_results", [](PyInferenceSession* sess) -> py::list { #if !defined(ORT_MINIMAL_BUILD) auto results = sess->GetSessionHandle()->GetTuningResults(); @@ -2177,8 +2147,8 @@ including arg name, arg type (contains both type and shape).)pbdoc") return ret; #else - ORT_UNUSED_PARAMETER(sess); - ORT_THROW("TunableOp and get_tuning_results are not supported in this build."); + ORT_UNUSED_PARAMETER(sess); + ORT_THROW("TunableOp and get_tuning_results are not supported in this build."); #endif }) .def("set_tuning_results", [](PyInferenceSession* sess, py::list results, bool error_on_invalid) -> void { @@ -2209,10 +2179,10 @@ including arg name, arg type (contains both type and shape).)pbdoc") throw std::runtime_error("Error in execution: " + status.ErrorMessage()); } #else - ORT_UNUSED_PARAMETER(sess); - ORT_UNUSED_PARAMETER(results); - ORT_UNUSED_PARAMETER(error_on_invalid); - ORT_THROW("TunableOp and set_tuning_results are not supported in this build."); + ORT_UNUSED_PARAMETER(sess); + ORT_UNUSED_PARAMETER(results); + ORT_UNUSED_PARAMETER(error_on_invalid); + ORT_THROW("TunableOp and set_tuning_results are not supported in this build."); #endif }); diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py index 10d9f469863c4..81e6461e4417f 100644 --- a/onnxruntime/python/onnxruntime_validation.py +++ b/onnxruntime/python/onnxruntime_validation.py @@ -24,8 +24,7 @@ def check_distro_info(): if __my_distro_ver__ not in ["10", "11"]: warnings.warn( - "Unsupported Windows version (%s). ONNX Runtime supports Windows 10 and above, only." - % __my_distro_ver__ + f"Unsupported Windows version ({__my_distro_ver__}). ONNX Runtime supports Windows 10 and above, only." ) elif __my_system__ == "linux": """Although the 'platform' python module for getting Distro information works well on standard OS images @@ -54,11 +53,11 @@ def check_distro_info(): if int(__my_distro_ver__.split(".")[0]) < 11: warnings.warn( - "Unsupported macOS version (%s). ONNX Runtime supports macOS 11.0 or later." % (__my_distro_ver__) + f"Unsupported macOS version ({__my_distro_ver__}). ONNX Runtime supports macOS 11.0 or later." ) else: warnings.warn( - "Unsupported platform (%s). ONNX Runtime supports Linux, macOS and Windows platforms, only." % __my_system__ + f"Unsupported platform ({__my_system__}). ONNX Runtime supports Linux, macOS and Windows platforms, only." 
) @@ -115,10 +114,10 @@ def validate_build_package_info(): cudart_version = None def print_build_package_info(): - warnings.warn("onnxruntime training package info: package_name: %s" % package_name) - warnings.warn("onnxruntime training package info: __version__: %s" % version) - warnings.warn("onnxruntime training package info: cuda_version: %s" % cuda_version) - warnings.warn("onnxruntime build info: cudart_version: %s" % cudart_version) + warnings.warn(f"onnxruntime training package info: package_name: {package_name}") + warnings.warn(f"onnxruntime training package info: __version__: {version}") + warnings.warn(f"onnxruntime training package info: cuda_version: {cuda_version}") + warnings.warn(f"onnxruntime build info: cudart_version: {cudart_version}") # collection cuda library info from current environment. from onnxruntime.capi.onnxruntime_collect_build_info import find_cudart_versions @@ -127,7 +126,7 @@ def print_build_package_info(): if cudart_version and local_cudart_versions and cudart_version not in local_cudart_versions: print_build_package_info() warnings.warn("WARNING: failed to find cudart version that matches onnxruntime build info") - warnings.warn("WARNING: found cudart versions: %s" % local_cudart_versions) + warnings.warn(f"WARNING: found cudart versions: {local_cudart_versions}") else: # TODO: rcom pass diff --git a/onnxruntime/python/tools/pytorch_export_contrib_ops.py b/onnxruntime/python/tools/pytorch_export_contrib_ops.py index aeb78f03dd721..d8cf3c1304219 100644 --- a/onnxruntime/python/tools/pytorch_export_contrib_ops.py +++ b/onnxruntime/python/tools/pytorch_export_contrib_ops.py @@ -22,7 +22,7 @@ def _reg(symbolic_fn: typing.Callable): - name = "::%s" % symbolic_fn.__name__ + name = f"::{symbolic_fn.__name__}" torch.onnx.register_custom_op_symbolic(name, symbolic_fn, _OPSET_VERSION) _registered_ops.add(name) diff --git a/onnxruntime/python/tools/quantization/calibrate.py b/onnxruntime/python/tools/quantization/calibrate.py index 65875d09102bd..703accbcc1c48 100644 --- a/onnxruntime/python/tools/quantization/calibrate.py +++ b/onnxruntime/python/tools/quantization/calibrate.py @@ -1076,7 +1076,7 @@ def get_entropy_threshold(self, histogram, num_quantized_bins): for i in range(num_half_quantized_bin, zero_bin_index + 1, 1): start_index = zero_bin_index - i - end_index = zero_bin_index + i + 1 if (zero_bin_index + i + 1) <= num_bins else num_bins + end_index = min(zero_bin_index + i + 1, num_bins) thresholds[i - num_half_quantized_bin] = (hist_edges[start_index], hist_edges[end_index]) diff --git a/onnxruntime/python/tools/symbolic_shape_infer.py b/onnxruntime/python/tools/symbolic_shape_infer.py index ac959d5c061f7..f88011c7a2cf9 100755 --- a/onnxruntime/python/tools/symbolic_shape_infer.py +++ b/onnxruntime/python/tools/symbolic_shape_infer.py @@ -24,7 +24,7 @@ def get_attribute(node, attr_name, default_value=None): def get_dim_from_proto(dim): - return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) is str else None # noqa: E721 + return getattr(dim, dim.WhichOneof("value")) if type(dim.WhichOneof("value")) is str else None def is_sequence(type_proto): @@ -92,19 +92,19 @@ def get_opset(mp, domain=None): def as_scalar(x): - if type(x) == list: # noqa: E721 + if type(x) is list: assert len(x) == 1 return x[0] - elif type(x) == np.ndarray: + elif type(x) is np.ndarray: return x.item() else: return x def as_list(x, keep_none): - if type(x) == list: # noqa: E721 + if type(x) is list: return x - elif type(x) == np.ndarray: + elif type(x) is 
np.ndarray: return list(x) elif keep_none and x is None: return None @@ -113,7 +113,7 @@ def as_list(x, keep_none): def sympy_reduce_product(x): - if type(x) == list: # noqa: E721 + if type(x) is list: value = sympy.Integer(1) for v in x: value = value * v @@ -258,7 +258,7 @@ def __init__(self, int_max, auto_merge, guess_output_rank, verbose, prefix=""): self.prefix_ = prefix def _add_suggested_merge(self, symbols, apply=False): - assert all([(type(s) == str and s in self.symbolic_dims_) or is_literal(s) for s in symbols]) # noqa: E721 + assert all([(type(s) is str and s in self.symbolic_dims_) or is_literal(s) for s in symbols]) symbols = set(symbols) for k, v in self.suggested_merge_.items(): if k in symbols: @@ -278,7 +278,7 @@ def _add_suggested_merge(self, symbols, apply=False): break if map_to is None: for s in symbols: - if type(self.symbolic_dims_[s]) == sympy.Symbol: + if type(self.symbolic_dims_[s]) is sympy.Symbol: map_to = s break # when nothing to map to, use the shorter one @@ -328,7 +328,7 @@ def _preprocess(self, in_mp): ) def _merge_symbols(self, dims): - if not all([type(d) == str for d in dims]): # noqa: E721 + if not all([type(d) is str for d in dims]): if self.auto_merge_: unique_dims = list(set(dims)) is_int = [is_literal(d) for d in unique_dims] @@ -408,7 +408,7 @@ def _get_shape_rank(self, node, idx): def _get_sympy_shape(self, node, idx): sympy_shape = [] for d in self._get_shape(node, idx): - if type(d) == str: # noqa: E721 + if type(d) is str: sympy_shape.append( self.symbolic_dims_[d] if d in self.symbolic_dims_ @@ -590,7 +590,7 @@ def _onnx_infer_subgraph(self, node, subgraph, use_node_input=True, inc_subgraph # for new symbolic dims from subgraph output, add to main graph symbolic dims subgraph_shapes = [get_shape_from_value_info(o) for o in symbolic_shape_inference.out_mp_.graph.output] subgraph_new_symbolic_dims = { - d for s in subgraph_shapes if s for d in s if type(d) == str and d not in self.symbolic_dims_ # noqa: E721 + d for s in subgraph_shapes if s for d in s if type(d) is str and d not in self.symbolic_dims_ } new_dims = {} for d in subgraph_new_symbolic_dims: @@ -610,7 +610,7 @@ def int_or_float(value, allow_float_values): if all([v is not None for v in values]): # some shape compute is in floating point, cast to int for sympy for i, v in enumerate(values): - if type(v) != np.ndarray: + if type(v) is not np.ndarray: continue if len(v.shape) > 1: new_v = None # ignore value for rank > 1 @@ -924,7 +924,7 @@ def _infer_Concat(self, node): # noqa: N802 if all([d == dims[0] for d in dims]): continue merged = self._merge_symbols(dims) - if type(merged) == str: # noqa: E721 + if type(merged) is str: sympy_shape[d] = self.symbolic_dims_[merged] if merged else None else: sympy_shape[d] = merged @@ -1060,7 +1060,7 @@ def _infer_Einsum(self, node): # noqa: N802 dim = shape[-i] if letter not in letter_to_dim: letter_to_dim[letter] = dim - elif type(dim) != sympy.Symbol: + elif type(dim) is not sympy.Symbol: letter_to_dim[letter] = dim num_operands = num_operands + 1 @@ -1127,8 +1127,8 @@ def _infer_Gather(self, node): # noqa: N802 idx = self._try_get_value(node, 1) if idx is not None: data = self.sympy_data_[node.input[0]] - if type(data) == list: # noqa: E721 - if type(idx) == np.ndarray and len(idx.shape) == 1: + if type(data) is list: + if type(idx) is np.ndarray and len(idx.shape) == 1: self.sympy_data_[node.output[0]] = [data[int(i)] for i in idx] else: self.sympy_data_[node.output[0]] = data[int(idx)] @@ -1530,7 +1530,7 @@ def _infer_aten_upsample(self, 
node): new_shape = input_shape[:2] output_size = self._try_get_value(node, 1) if output_size is not None: - new_shape += [dim_size.item() if type(dim_size) == np.int64 else dim_size for dim_size in output_size] + new_shape += [dim_size.item() if type(dim_size) is np.int64 else dim_size for dim_size in output_size] else: rank = len(input_shape) new_shape += [str(self._new_symbolic_dim_from_output(node, 0, i)) for i in range(2, rank)] @@ -1645,7 +1645,7 @@ def _infer_Reshape(self, node): # noqa: N802 deferred_dim_idx = -1 non_deferred_size = 1 for i, d in enumerate(shape_value): - if type(d) == sympy.Symbol: + if type(d) is sympy.Symbol: new_sympy_shape.append(d) elif d == 0: new_sympy_shape.append(input_sympy_shape[i]) @@ -1940,7 +1940,7 @@ def handle_negative_index(index, bound): # handle sympy_data if needed, for slice in shape computation if ( node.input[0] in self.sympy_data_ - and [0] == axes + and axes == [0] and starts is not None and len(starts) == 1 and ends is not None @@ -1949,8 +1949,8 @@ def handle_negative_index(index, bound): and len(steps) == 1 ): input_sympy_data = self.sympy_data_[node.input[0]] - if type(input_sympy_data) == list or ( # noqa: E721 - type(input_sympy_data) == np.array and len(input_sympy_data.shape) == 1 + if type(input_sympy_data) is list or ( + type(input_sympy_data) is np.array and len(input_sympy_data.shape) == 1 ): self.sympy_data_[node.output[0]] = input_sympy_data[starts[0] : ends[0] : steps[0]] @@ -2616,7 +2616,7 @@ def _infer_impl(self, start_sympy_data=None): # some models use None for symbolic dim in input, replace it with a string input_dims[i_dim].dim_param = str(self._new_symbolic_dim(i.name, i_dim)) - self.input_symbols_.update([d for d in input_shape if type(d) == str]) # noqa: E721 + self.input_symbols_.update([d for d in input_shape if type(d) is str]) for s in self.input_symbols_: if s in self.suggested_merge_: diff --git a/onnxruntime/python/tools/tensorrt/perf/benchmark.py b/onnxruntime/python/tools/tensorrt/perf/benchmark.py index 8af074f24acc9..4fa5d0c0ea034 100644 --- a/onnxruntime/python/tools/tensorrt/perf/benchmark.py +++ b/onnxruntime/python/tools/tensorrt/perf/benchmark.py @@ -925,8 +925,8 @@ def find_model_path(path): logger.info(target_model_path) if len(target_model_path) > 1: - logger.error("We expect to find only one model in " + path) # noqa: G003 - raise + logger.error("We expect to find only one model in %s", path) + raise RuntimeError return target_model_path[0] @@ -1007,7 +1007,7 @@ def parse_models_info_from_file(root_dir, path, models): models[row["model_name"]] = {} else: logger.error("Model name must be provided in models_info.json") - raise + raise RuntimeError model = models[row["model_name"]] @@ -1018,19 +1018,19 @@ def parse_models_info_from_file(root_dir, path, models): model["working_directory"] = os.path.join(root_working_directory, row["working_directory"]) else: logger.error("Model path must be provided in models_info.json") - raise + raise RuntimeError if "model_path" in row: model["model_path"] = row["model_path"] else: logger.error("Model path must be provided in models_info.json") - raise + raise RuntimeError if "test_data_path" in row: model["test_data_path"] = row["test_data_path"] else: logger.error("Test data path must be provided in models_info.json") - raise + raise RuntimeError if "model_path_fp16" in row: model["model_path_fp16"] = row["model_path_fp16"] diff --git a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py index 
c447bf9cffe27..0d0f7cc48f361 100644 --- a/onnxruntime/python/tools/tensorrt/perf/perf_utils.py +++ b/onnxruntime/python/tools/tensorrt/perf/perf_utils.py @@ -234,7 +234,7 @@ def calculate_trt_op_percentage(trt_op_map, cuda_op_map): if total_ops == 0: print("Error ...") - raise + raise RuntimeError if len(trt_op_map) == 0: total_cuda_and_cpu_ops = total_ops diff --git a/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py b/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py index 4f763ad84426d..0532dd7c72c1c 100644 --- a/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py +++ b/onnxruntime/python/tools/tensorrt/perf/setup_scripts/setup_onnx_zoo.py @@ -71,7 +71,7 @@ def write_json(models): def main(): links = [] with open("links.txt") as fh: - links = [link.rstrip() for link in fh.readlines()] + links = [link.rstrip() for link in fh] model_list = [] for link in links: diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 5ec2ab4e50799..4800c48744236 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -802,7 +802,7 @@ def main(): try: os.mkdir(args.cache_dir) except OSError: - logger.error("Creation of the directory %s failed" % args.cache_dir) # noqa: G002 + logger.error("Creation of the directory %s failed", args.cache_dir) enable_torch = "torch" in args.engines enable_torch2 = "torch2" in args.engines diff --git a/onnxruntime/python/tools/transformers/bert_test_data.py b/onnxruntime/python/tools/transformers/bert_test_data.py index aa82e047df328..167fc8697ce06 100644 --- a/onnxruntime/python/tools/transformers/bert_test_data.py +++ b/onnxruntime/python/tools/transformers/bert_test_data.py @@ -168,11 +168,11 @@ def output_test_data(directory: str, inputs: Dict[str, np.ndarray]): try: os.mkdir(directory) except OSError: - print("Creation of the directory %s failed" % directory) + print(f"Creation of the directory {directory} failed") else: - print("Successfully created the directory %s " % directory) + print(f"Successfully created the directory {directory} ") else: - print("Warning: directory %s existed. Files will be overwritten." % directory) + print(f"Warning: directory {directory} existed. 
Files will be overwritten.") for index, (name, data) in enumerate(inputs.items()): tensor = numpy_helper.from_array(data, name) diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index f48cabd25fc5c..dc2b38f3928ac 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -672,7 +672,7 @@ def create_multihead_attention_node( q_matmul, k_matmul, v_matmul, q_add, k_add, v_add, num_heads ) mha_inputs.extend([q_slice.output[0], k_slice.output[0], v_slice.output[0]]) - elif type(k_matmul) == NodeProto and type(v_matmul) == NodeProto: + elif type(k_matmul) is NodeProto and type(v_matmul) is NodeProto: if self.disable_multi_head_attention_bias: mha_inputs.extend([q_add.output[0], k_matmul.output[0], v_add.output[0]]) else: diff --git a/onnxruntime/python/tools/transformers/fusion_utils.py b/onnxruntime/python/tools/transformers/fusion_utils.py index 726c587ff7043..dbd9e828198ca 100644 --- a/onnxruntime/python/tools/transformers/fusion_utils.py +++ b/onnxruntime/python/tools/transformers/fusion_utils.py @@ -159,7 +159,7 @@ def transpose_2d_int8_tensor(tensor: onnx_proto.TensorProto): tensor (TensorProto): transposed tensor """ if not isinstance(tensor, onnx_proto.TensorProto): - raise ValueError("Expected input type is an ONNX TensorProto but got %s" % type(tensor)) + raise ValueError(f"Expected input type is an ONNX TensorProto but got {type(tensor)}") if len(tensor.dims) != 2 or tensor.data_type != onnx_proto.TensorProto.INT8: raise ValueError("Only INT8 2-D tensors can be transposed") diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py index 111520a6e3aeb..8a610fb17671b 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_edinit.py @@ -205,5 +205,5 @@ def export_encoder(args): no_repeat_ngram_size=no_repeat_ngram_size, ) time_cost = time.time() - start_time - print("--- %s seconds ---" % (time_cost)) + print(f"--- {time_cost} seconds ---") print(tokenizer.decode(pred_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)) diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py index 29c39730c79ef..afd01ae9d025f 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/export_summarization_enc_dec_past.py @@ -266,5 +266,5 @@ def export_decoder(args): use_cache=True, ) time_cost = time.time() - start_time - print("--- %s seconds ---" % (time_cost)) + print(f"--- {time_cost} seconds ---") print(tokenizer.decode(pred_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)) diff --git a/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py b/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py index c4c8a2dcba697..7a5cfe42c7efe 100644 --- a/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py +++ b/onnxruntime/python/tools/transformers/models/bart/utils/onnx_inference.py @@ -49,7 +49,7 @@ def run_inference(args): 
no_repeat_ngram_size=no_repeat_ngram_size, ) time_cost = time.time() - start_time - print("--- %s seconds ---" % (time_cost)) + print(f"--- {time_cost} seconds ---") for j in range(batch_num): for i in range(beam): print( @@ -81,7 +81,7 @@ def run_inference(args): start_time = time.time() out = sess.run(None, ort_inputs) time_cost = time.time() - start_time - print("--- %s seconds ---" % (time_cost)) + print(f"--- {time_cost} seconds ---") for j in range(batch_num): for i in range(beam): print( diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py index 26b9a2792e9e1..0b6d325803554 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder.py @@ -117,7 +117,7 @@ def get_cached_model_name(self, model_name): model_name = model_name + "_" + "_".join(self.pipeline_info.controlnet) if hash_source: - model_name += "_" + hashlib.md5("\t".join(hash_source).encode("utf-8")).digest().hex()[:8] + model_name += "_" + hashlib.md5("\t".join(hash_source).encode("utf-8")).hexdigest()[:8] # TODO: When we support original VAE, we shall save custom VAE to another directory. diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py index 1629537dc294f..522cc541c1e57 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py @@ -459,9 +459,9 @@ def denoise_latent( noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) noise_pred = noise_pred_uncond + guidance * (noise_pred_text - noise_pred_uncond) - if type(self.scheduler) == UniPCMultistepScheduler: + if type(self.scheduler) is UniPCMultistepScheduler: latents = self.scheduler.step(noise_pred, timestep, latents, return_dict=False)[0] - elif type(self.scheduler) == LCMScheduler: + elif type(self.scheduler) is LCMScheduler: latents = self.scheduler.step(noise_pred, timestep, latents, generator=self.generator)[0] else: latents = self.scheduler.step(noise_pred, latents, step_offset + step_index, timestep) diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index 26e40b25930c8..4e9e80b180e9c 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -1883,7 +1883,7 @@ TEST_F(PlannerTest, ParaPlanCreation) { ORT_ENFORCE(main_graph_ort_value_index_map.GetName(per_value_plan.reused_buffer, reused).IsOK()); reuse_pairs.erase(reused); } // if - } // for + } // for ASSERT_TRUE(reuse_pairs.empty()); } diff --git a/onnxruntime/test/onnx/OrtValueList.h b/onnxruntime/test/onnx/OrtValueList.h index 2929cdca428d9..921c1d3872111 100644 --- a/onnxruntime/test/onnx/OrtValueList.h +++ b/onnxruntime/test/onnx/OrtValueList.h @@ -14,7 +14,7 @@ class OrtValueArray { public: ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OrtValueArray); // n must be non-negative - OrtValueArray(int n) : values(static_cast(n), nullptr){}; + OrtValueArray(int n) : values(static_cast(n), nullptr) {}; ~OrtValueArray() { for (OrtValue* v : values) { if (v != nullptr) Ort::GetApi().ReleaseValue(v); diff --git a/onnxruntime/test/onnx/microbenchmark/activation.cc 
b/onnxruntime/test/onnx/microbenchmark/activation.cc index 69ee72996365e..df36135bd3017 100644 --- a/onnxruntime/test/onnx/microbenchmark/activation.cc +++ b/onnxruntime/test/onnx/microbenchmark/activation.cc @@ -27,7 +27,7 @@ class Allocs : public IExecutionProvider { std::shared_ptr alloc = std::make_shared(); public: - Allocs() : IExecutionProvider("fake"){}; + Allocs() : IExecutionProvider("fake") {}; AllocatorPtr GetAllocator(OrtMemType) const { return alloc; } diff --git a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc index ec9f78da14a75..ccfa1f1159937 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_fastmath_test.cc @@ -401,7 +401,7 @@ void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one auto check_binary_op_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); - if ((!has_output_q || std::is_same_v)&&(!has_bias || (std::is_same_v && !beta_not_one)) && + if ((!has_output_q || std::is_same_v) && (!has_bias || (std::is_same_v && !beta_not_one)) && (std::is_same_v || std::is_same_v)) { EXPECT_EQ(op_to_count["com.microsoft.QGemm"], 1); EXPECT_EQ(op_to_count["Gemm"], 0); diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 1638851daf65a..14c5b60d6e0bd 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -786,7 +786,7 @@ void QDQTransformerGemmTests(bool has_output_q, bool has_bias, bool beta_not_one auto check_binary_op_graph = [&](InferenceSessionWrapper& session) { auto op_to_count = CountOpsInGraph(session.GetGraph()); const QDQOpKeys qdq_keys = GetQDQOpKeys(use_contrib_qdq); - if ((!has_output_q || std::is_same_v)&&(!has_bias || (std::is_same_v && !beta_not_one)) && + if ((!has_output_q || std::is_same_v) && (!has_bias || (std::is_same_v && !beta_not_one)) && (std::is_same_v || std::is_same_v)) { EXPECT_EQ(op_to_count["com.microsoft.QGemm"], 1); EXPECT_EQ(op_to_count["Gemm"], 0); diff --git a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py index 568a4649f3977..bd06ae9fe881a 100644 --- a/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py +++ b/onnxruntime/test/providers/cpu/reduction/reduction_test_cases_generator.py @@ -40,13 +40,13 @@ def TestReduction(op, data, axes, keepdims): # noqa: N802 def PrintResult(op, axes, keepdims, res): # noqa: N802 - print(' {"%s",' % op) + print(f' {{"{op}",') print("OpAttributesResult(") print(" // ReductionAttribute") print(" {") print(" // axes_") print("{", end="") - print(*axes, sep=", ", end="") if axes else print("") + print(*axes, sep=", ", end="") if axes else print() print("},") print(" // keep_dims_") print(keepdims, ",") @@ -60,7 +60,7 @@ def PrintResult(op, axes, keepdims, res): # noqa: N802 print(" // expected values") print("{", end="") for i in range(res.size): - print("%5.6ff," % res.item(i)) + print(f"{res.item(i):5.6f}f,") print("})},") @@ -130,7 +130,7 @@ def PrintReenableOptimizations(): # noqa: N802 print("{") for i in range(input_data.size): print( - "%5.6ff," % input_data.item(i), + f"{input_data.item(i):5.6f}f,", ) print("},") print("// input_dims") diff --git 
a/onnxruntime/test/providers/cpu/tensor/pad_test.cc b/onnxruntime/test/providers/cpu/tensor/pad_test.cc index 5fc8ed417391e..1d9cd15f53327 100644 --- a/onnxruntime/test/providers/cpu/tensor/pad_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/pad_test.cc @@ -66,13 +66,13 @@ static void RunAllOpsetAllDomainPadTests( bool pads_is_initializer; bool value_is_initializer; }; - const std::vector all_test_params { - {false, false}, + const std::vector all_test_params{ + {false, false}, #if (defined(USE_NNAPI) && defined(__ANDROID__)) || (defined(USE_COREML) && defined(__APPLE__)) - // only enable when building NNAPI EP on Android or building CoreML EP for Apple environment - // test runs out of memory in QEMU aarch64 environment, so don't enable otherwise - // TODO try to enable when we move from QEMU to arm64 CI machines - {true, true}, + // only enable when building NNAPI EP on Android or building CoreML EP for Apple environment + // test runs out of memory in QEMU aarch64 environment, so don't enable otherwise + // TODO try to enable when we move from QEMU to arm64 CI machines + {true, true}, #endif }; for (const auto& test_params : all_test_params) { diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc index 9489d354755e4..9d19c36dc94b2 100644 --- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc @@ -835,14 +835,14 @@ TEST_F(QnnHTPBackendTests, HTPGraphFinalizationOptimizationModes) { // Test that models run with various SoC model values TEST_F(QnnHTPBackendTests, HTPSocModels) { - constexpr std::array soc_models = { "", // No explicit SoC model specified - "0", // "Unknown" + constexpr std::array soc_models = {"", // No explicit SoC model specified + "0", // "Unknown" #if defined(_M_ARM64) - "37" }; // SC8280X + "37"}; // SC8280X #elif defined(__linux__) - "30" }; // SM8350 + "30"}; // SM8350 #else - "" }; + ""}; #endif for (auto soc_model : soc_models) { diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index 6eebc996fde9c..9b1e87f6ec02e 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -76,7 +76,7 @@ def apply_filters(filters, category): opset_version = f"opset{onnx.defs.onnx_opset_version()}" validated_filters = [] for f in filters[category]: - if type(f) is list: # noqa: E721 + if type(f) is list: opset_regex = f[0] filter_regex = f[1] opset_match = re.match(opset_regex, opset_version) diff --git a/onnxruntime/test/python/transformers/rotary_flash.py b/onnxruntime/test/python/transformers/rotary_flash.py index 42bff9c92b41b..4329b2c1a6057 100644 --- a/onnxruntime/test/python/transformers/rotary_flash.py +++ b/onnxruntime/test/python/transformers/rotary_flash.py @@ -486,9 +486,6 @@ def backward(ctx, dkv): return dkv, None, None, None, None -apply_rotary_emb_kv_ = ApplyRotaryEmbKV.apply - - def apply_rotary_emb_kv_( kv, cos, diff --git a/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py b/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py index 0086ce0d289c7..c1e95f35a633b 100644 --- a/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py +++ 
b/onnxruntime/test/python/transformers/test_data/bert_squad_tensorflow2.1_keras2onnx_opset11/generate_tiny_keras2onnx_bert_models.py @@ -343,9 +343,9 @@ def generate_test_data( try: os.mkdir(path) except OSError: - print("Creation of the directory %s failed" % path) + print(f"Creation of the directory {path} failed") else: - print("Successfully created the directory %s " % path) + print(f"Successfully created the directory {path} ") if input_tensor_only: return diff --git a/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py b/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py index 065783d5812a8..4a4a0bc2c5098 100644 --- a/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py +++ b/onnxruntime/test/python/transformers/test_data/gpt2_pytorch1.5_opset11/generate_tiny_gpt2_model.py @@ -452,9 +452,9 @@ def generate_test_data( try: os.mkdir(path) except OSError: - print("Creation of the directory %s failed" % path) + print(f"Creation of the directory {path} failed") else: - print("Successfully created the directory %s " % path) + print(f"Successfully created the directory {path} ") sess_options = onnxruntime.SessionOptions() sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL diff --git a/onnxruntime/test/shared_lib/custom_op_utils.h b/onnxruntime/test/shared_lib/custom_op_utils.h index 8ead4056b1b54..e11540aaa5691 100644 --- a/onnxruntime/test/shared_lib/custom_op_utils.h +++ b/onnxruntime/test/shared_lib/custom_op_utils.h @@ -381,9 +381,9 @@ struct StandaloneCustomOp : Ort::CustomOpBase { @@ -397,9 +397,9 @@ struct MulTopOpFloat : Ort::CustomOpBase { }; struct MulTopKernelInt32 { - MulTopKernelInt32(const OrtKernelInfo*){}; + MulTopKernelInt32(const OrtKernelInfo*) {}; ~MulTopKernelInt32() = default; - void Compute(OrtKernelContext*){}; + void Compute(OrtKernelContext*) {}; }; struct MulTopOpInt32 : Ort::CustomOpBase { @@ -413,9 +413,9 @@ struct MulTopOpInt32 : Ort::CustomOpBase { }; struct MulTopKernelDouble { - MulTopKernelDouble(const OrtKernelInfo*){}; + MulTopKernelDouble(const OrtKernelInfo*) {}; ~MulTopKernelDouble() = default; - void Compute(OrtKernelContext*){}; + void Compute(OrtKernelContext*) {}; }; // MulTopOpDouble and MulTopOpFloat has input count mismatch @@ -430,9 +430,9 @@ struct MulTopOpDouble : Ort::CustomOpBase { }; struct MulTopKernelInt16 { - MulTopKernelInt16(const OrtKernelInfo*){}; + MulTopKernelInt16(const OrtKernelInfo*) {}; ~MulTopKernelInt16() = default; - void Compute(OrtKernelContext*){}; + void Compute(OrtKernelContext*) {}; }; // MulTopOpInt16 and MulTopOpFloat has output count mismatch @@ -448,9 +448,9 @@ struct MulTopOpInt16 : Ort::CustomOpBase { // MulTopKernelFloat16 and MulTopOpFloat has input characteristic mismatch struct MulTopKernelFloat16 { - MulTopKernelFloat16(const OrtKernelInfo*){}; + MulTopKernelFloat16(const OrtKernelInfo*) {}; ~MulTopKernelFloat16() = default; - void Compute(OrtKernelContext*){}; + void Compute(OrtKernelContext*) {}; }; struct MulTopOpFloat16 : Ort::CustomOpBase { diff --git a/onnxruntime/test/testdata/CNTK/gen.py b/onnxruntime/test/testdata/CNTK/gen.py index 37241a46808b5..5a3ca461f471a 100644 --- a/onnxruntime/test/testdata/CNTK/gen.py +++ b/onnxruntime/test/testdata/CNTK/gen.py @@ -48,10 +48,10 @@ def Save(dir, func, feed, outputs): # noqa: N802 if actual_input_name.startswith(cntk_name): cntk_to_actual_names[cntk_name] = actual_input_name - if 
type(feed) is not dict: # noqa: E721 + if type(feed) is not dict: feed = {func.arguments[0]: feed} - if type(outputs) is not dict: # noqa: E721 + if type(outputs) is not dict: outputs = {func.outputs[0]: outputs} test_data_dir = os.path.join(dir, data_dir) diff --git a/orttraining/orttraining/core/framework/adasum/adasum_mpi.cc b/orttraining/orttraining/core/framework/adasum/adasum_mpi.cc index 805de812cfa65..dc812ee2aec3f 100644 --- a/orttraining/orttraining/core/framework/adasum/adasum_mpi.cc +++ b/orttraining/orttraining/core/framework/adasum/adasum_mpi.cc @@ -35,8 +35,7 @@ void AdasumMPI::InitializeVHDDReductionComms(WorkerGroupType worker_group) { int nearest_power_2 = 1; int log_size; for (nearest_power_2 = 1, log_size = 0; (nearest_power_2 << 1) <= size; - nearest_power_2 = (nearest_power_2 << 1), log_size++) - ; + nearest_power_2 = (nearest_power_2 << 1), log_size++); int shift_val; int level; reduction_comms_ = std::make_unique>(); diff --git a/orttraining/orttraining/core/framework/pipeline.h b/orttraining/orttraining/core/framework/pipeline.h index a93ba1081d7df..79701106c9c1d 100644 --- a/orttraining/orttraining/core/framework/pipeline.h +++ b/orttraining/orttraining/core/framework/pipeline.h @@ -247,7 +247,7 @@ struct PipelineWorkerState { struct PipelineWorkerPool { PipelineWorkerPool() = default; - PipelineWorkerPool(size_t num_workers) : workers(num_workers), worker_states(num_workers){}; + PipelineWorkerPool(size_t num_workers) : workers(num_workers), worker_states(num_workers) {}; void Join(size_t worker_id); void JoinAll(); diff --git a/orttraining/orttraining/core/framework/torch/custom_function_register.h b/orttraining/orttraining/core/framework/torch/custom_function_register.h index 762258a45221e..ddb838ba6475c 100644 --- a/orttraining/orttraining/core/framework/torch/custom_function_register.h +++ b/orttraining/orttraining/core/framework/torch/custom_function_register.h @@ -102,7 +102,7 @@ class OrtTorchFunctionPool final { void UnRegisterFunctions(); private: - OrtTorchFunctionPool(){}; + OrtTorchFunctionPool() {}; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OrtTorchFunctionPool); void UnRegisterGlobalFunctions(); diff --git a/orttraining/orttraining/core/framework/torch/gil.h b/orttraining/orttraining/core/framework/torch/gil.h index c928571d2024a..b14b062785eef 100644 --- a/orttraining/orttraining/core/framework/torch/gil.h +++ b/orttraining/orttraining/core/framework/torch/gil.h @@ -13,7 +13,7 @@ // See https://docs.python.org/3/c-api/init.html#non-python-created-threads for details. 
class GilGuard { public: - GilGuard() : state_(PyGILState_Ensure()){}; + GilGuard() : state_(PyGILState_Ensure()) {}; ~GilGuard() { PyGILState_Release(state_); }; private: diff --git a/orttraining/orttraining/core/framework/torch/torch_proxy.h b/orttraining/orttraining/core/framework/torch/torch_proxy.h index b80acd6c4791a..37766e67ef42f 100644 --- a/orttraining/orttraining/core/framework/torch/torch_proxy.h +++ b/orttraining/orttraining/core/framework/torch/torch_proxy.h @@ -95,8 +95,8 @@ class TorchProxy { std::vector& bw_output_to_input_alias_map); private: - TorchProxy(){}; - ~TorchProxy(){}; + TorchProxy() {}; + ~TorchProxy() {}; ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TorchProxy); diff --git a/orttraining/orttraining/core/graph/graph_augmenter.h b/orttraining/orttraining/core/graph/graph_augmenter.h index eb146ca0e84f3..c3b6d227f01fd 100644 --- a/orttraining/orttraining/core/graph/graph_augmenter.h +++ b/orttraining/orttraining/core/graph/graph_augmenter.h @@ -33,7 +33,7 @@ struct OpDef { OpDef(const std::string& type, const std::string& domain = kOnnxDomain, const int opset_version = 9) : type(type), domain(domain), - opset_version(opset_version){}; + opset_version(opset_version) {}; std::string type; std::string domain; @@ -52,7 +52,7 @@ struct NodeDef { output_args(output_args), attributes(attributes), name(name), - priority(priority){}; + priority(priority) {}; NodeDef(const std::string& op_type, const std::vector& input_args, @@ -64,7 +64,7 @@ struct NodeDef { output_args(output_args), attributes(attributes), name(name), - priority(priority){}; + priority(priority) {}; NodeDef(const OpDef& op_def, const std::vector& input_args, diff --git a/orttraining/orttraining/core/graph/loss_func/loss_func_common.h b/orttraining/orttraining/core/graph/loss_func/loss_func_common.h index 2b60280e076aa..61bc0a094dac4 100644 --- a/orttraining/orttraining/core/graph/loss_func/loss_func_common.h +++ b/orttraining/orttraining/core/graph/loss_func/loss_func_common.h @@ -21,7 +21,7 @@ struct LossFunctionInfo { struct ILossFunction { virtual GraphAugmenter::GraphDefs operator()(const Graph& graph, const LossFunctionInfo& loss_func_info) = 0; - virtual ~ILossFunction(){}; + virtual ~ILossFunction() {}; }; TypeProto* GetSparseTypeProto(const NodeArg* input_arg, diff --git a/orttraining/orttraining/core/graph/pipeline_transformer.cc b/orttraining/orttraining/core/graph/pipeline_transformer.cc index f989d53aa85d5..3495c3da72e3d 100644 --- a/orttraining/orttraining/core/graph/pipeline_transformer.cc +++ b/orttraining/orttraining/core/graph/pipeline_transformer.cc @@ -887,7 +887,7 @@ struct PipelineStageNodeGroup { // the consumer nodes of a particular initializer can be more than one, so we need a vector to store those // nodes. std::vector nodes; - PipelineStageNodeGroup(const size_t stage, std::vector& node_group) : stage_id(stage), nodes(std::move(node_group)){}; + PipelineStageNodeGroup(const size_t stage, std::vector& node_group) : stage_id(stage), nodes(std::move(node_group)) {}; }; // This function passes through the given initializer across stages specified in node_groups[i].stage_id. 
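For readers of the adasum_mpi.cc hunk above: the reformatted statement is a for loop with an intentionally empty body (the null-statement ';' now sits on the same line), and it computes the largest power of two not exceeding the worker-group size together with its base-2 logarithm. A minimal, self-contained C++ sketch of that computation, using an illustrative size value rather than anything taken from the ORT code, is:

#include <iostream>

int main() {
  int size = 12;  // illustrative worker-group size, not a value from the patch
  int nearest_power_2 = 1;
  int log_size = 0;
  // Empty-bodied loop: all the work happens in the init/condition/increment clauses.
  for (nearest_power_2 = 1, log_size = 0; (nearest_power_2 << 1) <= size;
       nearest_power_2 = (nearest_power_2 << 1), log_size++);
  std::cout << nearest_power_2 << " " << log_size << "\n";  // prints "8 3" for size == 12
  return 0;
}
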
diff --git a/orttraining/orttraining/core/optimizer/megatron_transformer.cc b/orttraining/orttraining/core/optimizer/megatron_transformer.cc index 4ebea5cf386cc..25e16304789b6 100644 --- a/orttraining/orttraining/core/optimizer/megatron_transformer.cc +++ b/orttraining/orttraining/core/optimizer/megatron_transformer.cc @@ -21,7 +21,7 @@ struct OpInfo { const size_t output_count = 1) : op_type(op_type), supported_versions(supported_versions), domain(domain), - output_count(output_count){}; + output_count(output_count) {}; std::string op_type; std::initializer_list supported_versions; @@ -53,7 +53,7 @@ const OpInfo where_info = OpInfo("Where", opset_v9); struct NodeInfo { NodeInfo(const std::vector& op_infos, const bool required = true) : op_infos(op_infos), - required(required){}; + required(required) {}; std::vector op_infos; bool required; diff --git a/orttraining/orttraining/core/session/training_session.h b/orttraining/orttraining/core/session/training_session.h index 37b708fb7d1dd..765f88e1c992e 100644 --- a/orttraining/orttraining/core/session/training_session.h +++ b/orttraining/orttraining/core/session/training_session.h @@ -46,7 +46,7 @@ class TrainingSession : public InferenceSession { TrainingSession(const SessionOptions& session_options, const Environment& env) : InferenceSession(session_options, env), is_mixed_precision_enabled_(false) {} - virtual ~TrainingSession(){}; + virtual ~TrainingSession() {}; /** * The training configuration options. @@ -215,11 +215,11 @@ class TrainingSession : public InferenceSession { // If the edge is unique, i.e. only have one consumer node, or all the edges // with the same node_arg_name needs to be cut, specify the node_arg_name // suffices. - CutEdge(std::string edge) : node_arg_name(edge){}; + CutEdge(std::string edge) : node_arg_name(edge) {}; // If the edges with same node_arg_name belongs to different cut, i.e. some of its // consumer node belongs to one partition, and some belongs to another, specify // the consumer node names which you want to perform the cut on. - CutEdge(std::string edge, std::vector nodes) : node_arg_name(edge), consumer_nodes(nodes){}; + CutEdge(std::string edge, std::vector nodes) : node_arg_name(edge), consumer_nodes(nodes) {}; }; // CutInfo is a group of CutEdges that describes a specific cut that composed of splitting those edges. typedef std::vector CutInfo; diff --git a/orttraining/orttraining/lazy_tensor/flags.h b/orttraining/orttraining/lazy_tensor/flags.h index b849f9f9a0a3e..1812466d10346 100644 --- a/orttraining/orttraining/lazy_tensor/flags.h +++ b/orttraining/orttraining/lazy_tensor/flags.h @@ -60,7 +60,7 @@ class DynamicSettings { } private: - DynamicSettings() : onnx_fusion_status_(true){}; + DynamicSettings() : onnx_fusion_status_(true) {}; bool onnx_fusion_status_; }; diff --git a/orttraining/orttraining/models/bert/main.cc b/orttraining/orttraining/models/bert/main.cc index 22cdd9351a206..c4c7a98ba116a 100644 --- a/orttraining/orttraining/models/bert/main.cc +++ b/orttraining/orttraining/models/bert/main.cc @@ -861,8 +861,7 @@ int main(int argc, char* argv[]) { OrtParameters ort_params{}; RETURN_IF_FAIL(ParseArguments(argc, argv, params, ort_params)); bool keep_looping = params.debug_break; - while (keep_looping) - ; + while (keep_looping); // setup logger, be noted: LOGS_DEFAULT must be after logging manager initialization. 
string default_logger_id{"Default"}; diff --git a/orttraining/orttraining/models/pipeline_poc/main.cc b/orttraining/orttraining/models/pipeline_poc/main.cc index c461e4bbf3600..1b7d6b9ea26f6 100644 --- a/orttraining/orttraining/models/pipeline_poc/main.cc +++ b/orttraining/orttraining/models/pipeline_poc/main.cc @@ -86,36 +86,36 @@ int main(int argc, char* argv[]) { // setup onnxruntime env std::vector overrides = {}; SessionOptions so = { - ExecutionMode::ORT_SEQUENTIAL, // execution_mode - ExecutionOrder::DEFAULT, // execution_order - false, // enable_profiling - ORT_TSTR(""), // optimized_model_filepath - true, // enable_mem_pattern - true, // enable_mem_reuse - true, // enable_cpu_mem_arena - ORT_TSTR("onnxruntime_profile_"), // profile_file_prefix - "", // session_logid - -1, // session_log_severity_level - 0, // session_log_verbosity_level - 5, // max_num_graph_transformation_steps - TransformerLevel::Level1, // graph_optimization_level - {}, // intra_op_param - {}, // inter_op_param - overrides, // free_dimension_overrides - true, // use_per_session_threads - true, // thread_pool_allow_spinning - false, // use_deterministic_compute - {}, // session_configurations - {}, // initializers_to_share_map + ExecutionMode::ORT_SEQUENTIAL, // execution_mode + ExecutionOrder::DEFAULT, // execution_order + false, // enable_profiling + ORT_TSTR(""), // optimized_model_filepath + true, // enable_mem_pattern + true, // enable_mem_reuse + true, // enable_cpu_mem_arena + ORT_TSTR("onnxruntime_profile_"), // profile_file_prefix + "", // session_logid + -1, // session_log_severity_level + 0, // session_log_verbosity_level + 5, // max_num_graph_transformation_steps + TransformerLevel::Level1, // graph_optimization_level + {}, // intra_op_param + {}, // inter_op_param + overrides, // free_dimension_overrides + true, // use_per_session_threads + true, // thread_pool_allow_spinning + false, // use_deterministic_compute + {}, // session_configurations + {}, // initializers_to_share_map #if !defined(ORT_MINIMAL_BUILD) && !defined(DISABLE_EXTERNAL_INITIALIZERS) - {}, // external_initializers - {}, // external_initializer_files + {}, // external_initializers + {}, // external_initializer_files #endif - nullptr, // custom_create_thread_fn - nullptr, // custom_thread_creation_options - nullptr, // custom_join_thread_fn + nullptr, // custom_create_thread_fn + nullptr, // custom_thread_creation_options + nullptr, // custom_join_thread_fn #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_MINIMAL_BUILD_CUSTOM_OPS) - {}, // custom_op_libs + {}, // custom_op_libs #endif }; diff --git a/orttraining/orttraining/models/runner/training_util.h b/orttraining/orttraining/models/runner/training_util.h index 8c76ce7e50dc9..1499b30180f61 100644 --- a/orttraining/orttraining/models/runner/training_util.h +++ b/orttraining/orttraining/models/runner/training_util.h @@ -98,7 +98,7 @@ class RandomDataSet : public DataSet { : DataSet(tensor_names), num_samples_(num_samples), tensor_shapes_(tensor_shapes), - tensor_types_(tensor_types){}; + tensor_types_(tensor_types) {}; virtual ~RandomDataSet() {} @@ -189,7 +189,7 @@ class LossScaler { min_loss_scale_(min_loss_scale), max_loss_scale_(max_loss_scale), loss_scale_(loss_scale), - stable_steps_(0){}; + stable_steps_(0) {}; std::string GetLossScaleInputName() const { return loss_scale_input_name_; } diff --git a/orttraining/orttraining/python/orttraining_pybind_state.cc b/orttraining/orttraining/python/orttraining_pybind_state.cc index a81ea76e807ca..b2392b68ac43e 100644 --- 
a/orttraining/orttraining/python/orttraining_pybind_state.cc +++ b/orttraining/orttraining/python/orttraining_pybind_state.cc @@ -319,7 +319,7 @@ void addObjectMethodsForTraining(py::module& m) { auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance(); pool.RegisterForwardRunner(function_address); #else - ORT_UNUSED_PARAMETER(obj); + ORT_UNUSED_PARAMETER(obj); #endif }); m.def("register_backward_runner", [](py::object obj) -> void { @@ -328,7 +328,7 @@ void addObjectMethodsForTraining(py::module& m) { auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance(); pool.RegisterBackwardRunner(function_address); #else - ORT_UNUSED_PARAMETER(obj); + ORT_UNUSED_PARAMETER(obj); #endif }); m.def("register_torch_autograd_function", [](std::string function_full_qual_name, py::object obj) -> void { @@ -336,8 +336,8 @@ void addObjectMethodsForTraining(py::module& m) { auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance(); pool.RegisterTorchAutogradFunction(function_full_qual_name, obj.ptr()); #else - ORT_UNUSED_PARAMETER(function_full_qual_name); - ORT_UNUSED_PARAMETER(obj); + ORT_UNUSED_PARAMETER(function_full_qual_name); + ORT_UNUSED_PARAMETER(obj); #endif }); m.def("register_shape_inference_function", [](std::string function_full_qual_name, py::object obj) -> void { @@ -345,8 +345,8 @@ void addObjectMethodsForTraining(py::module& m) { auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance(); pool.RegisterShapeInferenceFunction(function_full_qual_name, obj.ptr()); #else - ORT_UNUSED_PARAMETER(function_full_qual_name); - ORT_UNUSED_PARAMETER(obj); + ORT_UNUSED_PARAMETER(function_full_qual_name); + ORT_UNUSED_PARAMETER(obj); #endif }); m.def("get_shape_inference_function", [](std::string function_full_qual_name) -> py::object { @@ -368,8 +368,8 @@ void addObjectMethodsForTraining(py::module& m) { auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance(); pool.RegisterInputAliasFunction(function_full_qual_name, obj.ptr()); #else - ORT_UNUSED_PARAMETER(function_full_qual_name); - ORT_UNUSED_PARAMETER(obj); + ORT_UNUSED_PARAMETER(function_full_qual_name); + ORT_UNUSED_PARAMETER(obj); #endif }); m.def("register_miscellaneous_const_input", [](py::object obj) -> void { @@ -377,7 +377,7 @@ void addObjectMethodsForTraining(py::module& m) { auto& pool = onnxruntime::language_interop_ops::torch::OrtTorchFunctionPool::GetInstance(); pool.RegisterMiscellaneousConstInput(obj.ptr()); #else - ORT_UNUSED_PARAMETER(obj); + ORT_UNUSED_PARAMETER(obj); #endif }); m.def("unregister_python_functions", []() -> void { @@ -391,14 +391,14 @@ void addObjectMethodsForTraining(py::module& m) { #ifdef ENABLE_TRAINING_TORCH_INTEROP return true; #else - return false; + return false; #endif }); m.def("is_triton_enabled", []() -> bool { #ifdef ENABLE_TRITON return true; #else - return false; + return false; #endif }); #ifdef ENABLE_TRITON @@ -1036,7 +1036,7 @@ void addObjectMethodsForTraining(py::module& m) { #ifdef __linux__ return true; #else - return false; + return false; #endif }); #endif diff --git a/orttraining/orttraining/python/training/ort_triton/kernel/_mm.py b/orttraining/orttraining/python/training/ort_triton/kernel/_mm.py index a3681a13699a0..1a944082fa4ba 100644 --- a/orttraining/orttraining/python/training/ort_triton/kernel/_mm.py +++ b/orttraining/orttraining/python/training/ort_triton/kernel/_mm.py @@ -372,7 +372,7 @@ def _gen_bmm_module( ) -> 
Tuple[str, ModuleType]: func_name = gen_unique_name("bmm") kwargs = _mm_configs(dtype, m, n, k, trans_a, trans_b, alpha, func_name) - batch = batch_a if batch_a >= batch_b else batch_b + batch = max(batch_a, batch_b) kwargs["stride_aq"] = m * k if batch_a == batch else 0 kwargs["stride_bq"] = k * n if batch_b == batch else 0 kwargs["batch"] = batch diff --git a/orttraining/orttraining/python/training/ortmodule/_utils.py b/orttraining/orttraining/python/training/ortmodule/_utils.py index c299d1c5db4e7..4787cb31a24fd 100644 --- a/orttraining/orttraining/python/training/ortmodule/_utils.py +++ b/orttraining/orttraining/python/training/ortmodule/_utils.py @@ -74,7 +74,7 @@ def _ortvalues_to_torch_tensor( return tuple(C.to_aten_ort_device_tensor(ov) for ov in ortvalues) if not isinstance(ortvalues, C.OrtValueVector): - raise TypeError("ortvalues must be an instance of OrtValueVector not %r." % type(ortvalues)) + raise TypeError(f"ortvalues must be an instance of OrtValueVector not {type(ortvalues)!r}.") res: List[torch.Tensor] = ortvalues.to_dlpacks(_from_dlpack) bool_indices = ortvalues.bool_tensor_indices() diff --git a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/ctx_pool.h b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/ctx_pool.h index e7b101d987d7a..b62c2c40c30ee 100644 --- a/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/ctx_pool.h +++ b/orttraining/orttraining/python/training/ortmodule/torch_cpp_extensions/cpu/torch_interop_utils/ctx_pool.h @@ -58,8 +58,8 @@ class PyNodeSharedPointerPool { } private: - PyNodeSharedPointerPool(){}; - ~PyNodeSharedPointerPool(){}; + PyNodeSharedPointerPool() {}; + ~PyNodeSharedPointerPool() {}; PyNodeSharedPointerPool(const PyNodeSharedPointerPool&) = delete; PyNodeSharedPointerPool& operator=(const PyNodeSharedPointerPool&) = delete; diff --git a/orttraining/orttraining/test/distributed/partition_utils.h b/orttraining/orttraining/test/distributed/partition_utils.h index c22d0a3eb2f93..787a001903cce 100644 --- a/orttraining/orttraining/test/distributed/partition_utils.h +++ b/orttraining/orttraining/test/distributed/partition_utils.h @@ -159,7 +159,7 @@ struct PipelineStageNodeGroup { // the consumer nodes of a particular initializer can be more than one, so we need a vector to store those // nodes. std::vector nodes; - PipelineStageNodeGroup(const size_t stage, std::vector& node_group) : stage_id(stage), nodes(std::move(node_group)){}; + PipelineStageNodeGroup(const size_t stage, std::vector& node_group) : stage_id(stage), nodes(std::move(node_group)) {}; }; // This function passes through the given initializer across stages specified in node_groups[i].stage_id. 
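Several of the headers touched above (ctx_pool.h, torch_proxy.h, custom_function_register.h) share a singleton-style layout: an empty constructor and destructor now written with a space before the braces, private construction, and copying either deleted outright or disallowed through a macro. A minimal sketch of that shape follows; the static GetInstance accessor and the class name are assumptions added for illustration, not code from the patch:

#include <iostream>

class SingletonPool {
 public:
  static SingletonPool& GetInstance() {
    static SingletonPool pool;  // constructed once, on first use
    return pool;
  }
  SingletonPool(const SingletonPool&) = delete;             // no copies
  SingletonPool& operator=(const SingletonPool&) = delete;  // no assignment

 private:
  SingletonPool() {};   // empty body, spaced the way clang-format 18 expects
  ~SingletonPool() {};
};

int main() {
  SingletonPool& a = SingletonPool::GetInstance();
  SingletonPool& b = SingletonPool::GetInstance();
  std::cout << (&a == &b) << "\n";  // prints 1: both calls return the same object
  return 0;
}
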
diff --git a/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py b/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py index 8afbafccb8241..655c9def2c66c 100644 --- a/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py +++ b/orttraining/orttraining/test/python/orttraining_test_hierarchical_ortmodule.py @@ -198,7 +198,7 @@ def call_backward(y): y.sum().backward() def call_allclose(y, y_ref): - assert type(y) == type(y_ref) + assert type(y) is type(y_ref) if isinstance(y, Iterable): for ele, ele_ref in zip(y, y_ref): torch.allclose(ele, ele_ref) diff --git a/orttraining/orttraining/test/python/orttraining_test_model_transform.py b/orttraining/orttraining/test/python/orttraining_test_model_transform.py index 095830cd54ab8..6ea81fc6aa089 100644 --- a/orttraining/orttraining/test/python/orttraining_test_model_transform.py +++ b/orttraining/orttraining/test/python/orttraining_test_model_transform.py @@ -77,7 +77,7 @@ def fix_transpose(model): weight = numpy_helper.to_array(t[1]) assert len(weight.shape) == 2 weight = weight.transpose(perm) - new_weight = numpy_helper.from_array(weight, "%s_transposed" % t[1].name) + new_weight = numpy_helper.from_array(weight, f"{t[1].name}_transposed") model.graph.initializer.extend([new_weight]) replace_input_arg(model, node.output[0], new_weight.name) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index fe59c398d7abb..3615a12705241 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -3976,9 +3976,9 @@ def forward(self, input1, bool_argument, int_argument, float_argument): out = self.relu(out) return out - assert type(bool_argument) is bool # noqa: E721 - assert type(int_argument) is int # noqa: E721 - assert type(float_argument) is float # noqa: E721 + assert type(bool_argument) is bool + assert type(int_argument) is int + assert type(float_argument) is float device = "cuda" N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 @@ -4014,8 +4014,8 @@ def forward(self, input1, bool_argument): out = self.relu(out) return out - assert type(bool_arguments[0]) is bool # noqa: E721 - assert type(bool_arguments[1]) is bool # noqa: E721 + assert type(bool_arguments[0]) is bool + assert type(bool_arguments[1]) is bool device = "cuda" N, D_in, H, D_out = 32, 784, 500, 10 # noqa: N806 @@ -5501,7 +5501,7 @@ def forward(self, x): return x[: self.dim, :] def random_state_equal(a, b): - assert type(a) == type(b) + assert type(a) is type(b) if isinstance(a, tuple): assert len(a) == len(b) return all([random_state_equal(a_i, b_i) for a_i, b_i in zip(a, b)]) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py index a1a7d4660f266..41e1e0f5d0d57 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier.py @@ -385,7 +385,7 @@ def main(): # Set log level numeric_level = getattr(logging, args.log_level.upper(), None) if not isinstance(numeric_level, int): - raise ValueError("Invalid log level: %s" % args.log_level) + raise ValueError(f"Invalid log level: {args.log_level}") logging.basicConfig(level=numeric_level) # 2. 
Dataloader diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py index 0d5aba1a1a5c4..801eb58727689 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_bert_classifier_autocast.py @@ -385,7 +385,7 @@ def main(): # Set log level numeric_level = getattr(logging, args.log_level.upper(), None) if not isinstance(numeric_level, int): - raise ValueError("Invalid log level: %s" % args.log_level) + raise ValueError(f"Invalid log level: {args.log_level}") logging.basicConfig(level=numeric_level) # 2. Dataloader diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py index 5b28e9c52b480..5e0a4d38b51d6 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_deepspeed_zero_stage_1.py @@ -219,7 +219,7 @@ def main(): } log_level = log_level_mapping.get(args.log_level.upper(), None) if not isinstance(log_level, LogLevel): - raise ValueError("Invalid log level: %s" % args.log_level) + raise ValueError(f"Invalid log level: {args.log_level}") debug_options = DebugOptions(log_level=log_level, save_onnx=args.export_onnx_graphs, onnx_prefix="MNIST") model = ORTModule(model, debug_options) diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py index e1def2022d63f..537dcd2ccdb09 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_onnx_ops.py @@ -74,11 +74,11 @@ def run_step(model, x): ) onnx_graph_train = ort_model._torch_module._execution_manager._training_manager._onnx_models.optimized_model if debug: - with open("debug_%s_ortmodule_infer.onnx" % name, "wb") as f: + with open(f"debug_{name}_ortmodule_infer.onnx", "wb") as f: f.write(onnx_graph_inf.SerializeToString()) - with open("debug_%s_ortmodule_train.onnx" % name, "wb") as f: + with open(f"debug_{name}_ortmodule_train.onnx", "wb") as f: f.write(onnx_graph_train.SerializeToString()) - self.assertIn('op_type: "%s"' % name, str(onnx_graph_inf)) + self.assertIn(f'op_type: "{name}"', str(onnx_graph_inf)) for onnx_model in [onnx_graph_inf, onnx_graph_train]: for oimp in onnx_model.opset_import: if oimp.domain == "": @@ -86,10 +86,10 @@ def run_step(model, x): if op_grad_type is not None: if isinstance(op_grad_type, tuple): text = str(onnx_graph_train) - if all(map(lambda op: ('op_type: "%s"' % op) not in text, op_grad_type)): + if all(map(lambda op: (f'op_type: "{op}"') not in text, op_grad_type)): raise AssertionError("Operator {} not found in {}.".format(" or ".join(op_grad_type), text)) else: - self.assertIn('op_type: "%s"' % op_grad_type, str(onnx_graph_train)) + self.assertIn(f'op_type: "{op_grad_type}"', str(onnx_graph_train)) def get_torch_model_name(self, name, device): def from_numpy(v, device=None, requires_grad=False): @@ -137,7 +137,7 @@ def forward(self, input1): return TestGatherElement, "GatherElementsGrad", dict(rtol=1e-04, atol=1e-05) - raise AssertionError("Unexpected name=%r." 
% name) + raise AssertionError(f"Unexpected name={name!r}.") def test_onnx_ops(self): for name in ["GatherElements", "Softmax"]: diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py index d6f84d94c2838..5872a69dde876 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_poc.py @@ -201,7 +201,7 @@ def main(): # Set log level numeric_level = getattr(logging, args.log_level.upper(), None) if not isinstance(numeric_level, int): - raise ValueError("Invalid log level: %s" % args.log_level) + raise ValueError(f"Invalid log level: {args.log_level}") logging.basicConfig(level=numeric_level) else: print("Training MNIST on vanilla PyTorch....") diff --git a/orttraining/orttraining/test/python/orttraining_test_utilities.py b/orttraining/orttraining/test/python/orttraining_test_utilities.py index 0892bafcdb95d..faa04f327be7f 100644 --- a/orttraining/orttraining/test/python/orttraining_test_utilities.py +++ b/orttraining/orttraining/test/python/orttraining_test_utilities.py @@ -237,7 +237,7 @@ def test_data_flatten_and_unflatten(input_output_map, flag: int): flatten_schema = input_output_map[2] def _recursive_compare(real, expected): - assert type(real) == type(expected) + assert type(real) is type(expected) if isinstance(real, str): assert real == expected elif isinstance(real, abc.Sequence): @@ -258,7 +258,7 @@ def _recursive_compare(real, expected): out, schema = extract_data_and_schema(raw_data) assert all([torch.allclose(o, d) if isinstance(o, torch.Tensor) else o == d for o, d in zip(out, flatten_data)]) if not isinstance(raw_data, torch.Tensor): - assert type(schema) == type(raw_data) + assert type(schema) is type(raw_data) assert str(schema) == str(flatten_schema) diff --git a/orttraining/orttraining/test/training_ops/function_op_test_utils.cc b/orttraining/orttraining/test/training_ops/function_op_test_utils.cc index 9504ba2c1e69a..3daf6db96e31c 100644 --- a/orttraining/orttraining/test/training_ops/function_op_test_utils.cc +++ b/orttraining/orttraining/test/training_ops/function_op_test_utils.cc @@ -72,7 +72,7 @@ void OpFunctionTester::RunFunctionBodyGraphOnCPU(TwoDArray& results) { } } -OpFunctionTester::~OpFunctionTester(){}; +OpFunctionTester::~OpFunctionTester() {}; template std::unique_ptr CreateOpTester(const onnxruntime::training::OpDef& op_def, diff --git a/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel.h b/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel.h index f58cd3ecbaeca..850dc6de735f0 100644 --- a/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel.h +++ b/orttraining/orttraining/training_ops/cpu/torch/torch_custom_function_kernel.h @@ -25,7 +25,7 @@ class PythonOp final : public OpKernel, public PythonOpBase { // Pytorch's torch.autograd.Function.backward(...) wrapper. 
class PythonOpGrad final : public OpKernel, public PythonOpGradBase { public: - PythonOpGrad(const OpKernelInfo& info) : OpKernel(info), PythonOpGradBase(info){}; + PythonOpGrad(const OpKernelInfo& info) : OpKernel(info), PythonOpGradBase(info) {}; Status Compute(OpKernelContext* context) const override; }; diff --git a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc index bcc9a06f5a250..dac1d7a84b9d9 100644 --- a/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc +++ b/orttraining/orttraining/training_ops/cuda/cuda_training_kernels.cc @@ -271,258 +271,258 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSDomain, 1, Mega Status RegisterCudaTrainingKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { - BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // Adam - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Lamb - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Adam - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // Lamb - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, // default entry to avoid the list become empty after ops-reducing + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // Adam + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Lamb + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Adam + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // Lamb + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // the kernels within the following ifdef are not included in a build with // --enable_training_ops but without --enable_training #ifdef ENABLE_TRAINING // P2P communication operators. 
#if defined(ORT_USE_NCCL) || defined(USE_MPI) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #ifdef USE_MPI - BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_TRITON - BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_TRAINING_TORCH_INTEROP - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #ifdef ORT_USE_NCCL - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #endif }; diff --git a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc index 7824e98fe8a53..c570f727f2a92 100644 --- a/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc +++ b/orttraining/orttraining/training_ops/rocm/rocm_training_kernels.cc @@ -222,207 +222,207 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, Mega Status RegisterRocmTrainingKernels(KernelRegistry& kernel_registry) { static const BuildKernelCreateInfoFn function_table[] = { - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // Adam - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - // Lamb - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - 
BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - // BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // Adam + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + // Lamb + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + // BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, // P2P communication operators. #if defined(ORT_USE_NCCL) || defined(USE_MPI) - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #ifdef USE_MPI // BuildKernelCreateInfo, #endif - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #ifdef ENABLE_TRAINING_TORCH_INTEROP - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif #ifdef ORT_USE_NCCL - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, #endif }; diff --git a/orttraining/tools/scripts/gpt2_model_transform.py b/orttraining/tools/scripts/gpt2_model_transform.py index 294af13fe69b7..50bfda4b407af 100644 --- a/orttraining/tools/scripts/gpt2_model_transform.py +++ b/orttraining/tools/scripts/gpt2_model_transform.py @@ -192,7 +192,7 @@ def fix_transpose(model): weight = numpy_helper.to_array(t[1]) assert len(weight.shape) == 2 weight = weight.transpose(perm) - new_weight = numpy_helper.from_array(weight, "%s_transposed" % t[1].name) + new_weight = numpy_helper.from_array(weight, f"{t[1].name}_transposed") model.graph.initializer.extend([new_weight]) replace_input_arg(model, node.output[0], new_weight.name) diff --git a/orttraining/tools/scripts/model_transform.py b/orttraining/tools/scripts/model_transform.py index 2fb1936ff2184..e87429d10bf88 100644 --- a/orttraining/tools/scripts/model_transform.py +++ b/orttraining/tools/scripts/model_transform.py @@ -227,7 +227,7 @@ def fix_transpose(model): weight = numpy_helper.to_array(t[1]) assert len(weight.shape) == 2 weight = weight.transpose(perm) - new_weight = numpy_helper.from_array(weight, "%s_transposed" % t[1].name) + new_weight = numpy_helper.from_array(weight, f"{t[1].name}_transposed") model.graph.initializer.extend([new_weight]) replace_input_arg(model, node.output[0], new_weight.name) diff --git a/pyproject.toml b/pyproject.toml index 286e4f12721a2..1c3a719fb544a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -84,6 +84,7 @@ ignore = [ "PYI041", # May create confusion "PYI024", # May create confusion "SIM102", # We don't perfer always combining if branches + "SIM103", # Do not collapse if-else "SIM108", # We don't encourage ternary operators "SIM114", # Don't combine if branches for debugability "SIM116", # Don't use dict lookup to replace if-else diff --git a/requirements-lintrunner.txt b/requirements-lintrunner.txt index d19ebe379b50b..7d384f7b1df67 100644 --- a/requirements-lintrunner.txt +++ b/requirements-lintrunner.txt @@ -1,9 +1,9 @@ # This file is auto updated by dependabot -lintrunner-adapters>=0.11.0 +lintrunner-adapters>=0.12.4 # RUFF -ruff==0.3.2 +ruff==0.5.4 # BLACK-ISORT black==24.2.0 -isort==5.12.0 +isort==5.13.2 # CLANGFORMAT -clang-format==17.0.4 +clang-format==18.1.8 diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 98d9ba22b7190..587d035541c45 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -65,7 +65,7 @@ def _check_python_version(): def _str_to_bool(s): """Convert string to bool (in argparse context).""" if s.lower() not in ["true", "false"]: - raise ValueError("Need bool; got %r" % s) + raise 
ValueError(f"Need bool; got {s!r}") return {"true": True, "false": False}[s.lower()] @@ -806,7 +806,7 @@ def resolve_executable_path(command_or_path): def get_linux_distro(): try: with open("/etc/os-release") as f: - dist_info = dict(line.strip().split("=", 1) for line in f.readlines()) + dist_info = dict(line.strip().split("=", 1) for line in f) return dist_info.get("NAME", "").strip('"'), dist_info.get("VERSION", "").strip('"') except (OSError, ValueError): return "", "" @@ -1236,7 +1236,7 @@ def generate_build_tree( cmake_args += ["-Donnxruntime_USE_FULL_PROTOBUF=ON", "-DProtobuf_USE_STATIC_LIBS=ON"] if args.use_tvm and args.llvm_path is not None: - cmake_args += ["-DLLVM_DIR=%s" % args.llvm_path] + cmake_args += [f"-DLLVM_DIR={args.llvm_path}"] if args.use_cuda and not is_windows(): nvml_stub_path = cuda_home + "/lib64/stubs" @@ -1452,7 +1452,7 @@ def generate_build_tree( if args.enable_lazy_tensor: import torch - cmake_args += ["-Donnxruntime_PREBUILT_PYTORCH_PATH=%s" % os.path.dirname(torch.__file__)] + cmake_args += [f"-Donnxruntime_PREBUILT_PYTORCH_PATH={os.path.dirname(torch.__file__)}"] cmake_args += ["-D_GLIBCXX_USE_CXX11_ABI=" + str(int(torch._C._GLIBCXX_USE_CXX11_ABI))] if args.use_azure: @@ -1582,7 +1582,7 @@ def generate_build_tree( else: cuda_compile_flags_str = cuda_compile_flags_str + " " + compile_flag if len(cuda_compile_flags_str) != 0: - cudaflags.append('-Xcompiler="%s"' % cuda_compile_flags_str) + cudaflags.append(f'-Xcompiler="{cuda_compile_flags_str}"') elif is_linux() or is_macOS(): if is_linux(): ldflags = ["-Wl,-Bsymbolic-functions", "-Wl,-z,relro", "-Wl,-z,now", "-Wl,-z,noexecstack"] @@ -1650,16 +1650,16 @@ def generate_build_tree( temp_cmake_args = cmake_args.copy() if cflags is not None and cxxflags is not None and len(cflags) != 0 and len(cxxflags) != 0: temp_cmake_args += [ - "-DCMAKE_C_FLAGS=%s" % (" ".join(cflags)), - "-DCMAKE_CXX_FLAGS=%s" % (" ".join(cxxflags)), + "-DCMAKE_C_FLAGS={}".format(" ".join(cflags)), + "-DCMAKE_CXX_FLAGS={}".format(" ".join(cxxflags)), ] if cudaflags is not None and len(cudaflags) != 0: - temp_cmake_args += ["-DCMAKE_CUDA_FLAGS_INIT=%s" % (" ".join(cudaflags))] + temp_cmake_args += ["-DCMAKE_CUDA_FLAGS_INIT={}".format(" ".join(cudaflags))] if ldflags is not None and len(ldflags) != 0: temp_cmake_args += [ - "-DCMAKE_EXE_LINKER_FLAGS_INIT=%s" % (" ".join(ldflags)), - "-DCMAKE_MODULE_LINKER_FLAGS_INIT=%s" % (" ".join(ldflags)), - "-DCMAKE_SHARED_LINKER_FLAGS_INIT=%s" % (" ".join(ldflags)), + "-DCMAKE_EXE_LINKER_FLAGS_INIT={}".format(" ".join(ldflags)), + "-DCMAKE_MODULE_LINKER_FLAGS_INIT={}".format(" ".join(ldflags)), + "-DCMAKE_SHARED_LINKER_FLAGS_INIT={}".format(" ".join(ldflags)), ] run_subprocess( [ diff --git a/tools/ci_build/gen_def.py b/tools/ci_build/gen_def.py index fe47d8dbe57fe..c4add6f0e8910 100755 --- a/tools/ci_build/gen_def.py +++ b/tools/ci_build/gen_def.py @@ -15,11 +15,11 @@ def parse_arguments(): args = parse_arguments() -print("Generating symbol file for %s" % str(args.config)) +print(f"Generating symbol file for {args.config!s}") with open(args.version_file) as f: VERSION_STRING = f.read().strip() -print("VERSION:%s" % VERSION_STRING) +print(f"VERSION:{VERSION_STRING}") symbols = set() for c in args.config: @@ -41,16 +41,16 @@ def parse_arguments(): elif args.style == "xcode": pass # xcode compile don't has any header. 
else: - file.write("VERS_%s {\n" % VERSION_STRING) + file.write(f"VERS_{VERSION_STRING} {{\n") file.write(" global:\n") for symbol in symbols: if args.style == "vc": file.write(" %s @%d\n" % (symbol, symbol_index)) elif args.style == "xcode": - file.write("_%s\n" % symbol) + file.write(f"_{symbol}\n") else: - file.write(" %s;\n" % symbol) + file.write(f" {symbol};\n") symbol_index += 1 if args.style == "gcc": diff --git a/tools/ci_build/reduce_op_kernels.py b/tools/ci_build/reduce_op_kernels.py index 6b73b1e063e58..df6bbf7a4058e 100755 --- a/tools/ci_build/reduce_op_kernels.py +++ b/tools/ci_build/reduce_op_kernels.py @@ -256,7 +256,7 @@ def _generate_type_control_overrides(ort_root: Path, build_dir: Path, cpp_lines: inserted = False with open(src) as input, open(target, "w") as output: inside_insertion_block = False - for line in input.readlines(): + for line in input: if "@@insertion_point_begin(allowed_types)@@" in line: inside_insertion_block = True output.write(line) diff --git a/tools/ci_build/replace_urls_in_deps.py b/tools/ci_build/replace_urls_in_deps.py index ac4f515d5482b..37dad358a6feb 100644 --- a/tools/ci_build/replace_urls_in_deps.py +++ b/tools/ci_build/replace_urls_in_deps.py @@ -53,10 +53,10 @@ def main(): csv_file_path = backup_csv_file_path else: # Make a copy before modifying it - print("Making a copy to %s" % str(backup_csv_file_path)) + print(f"Making a copy to {backup_csv_file_path!s}") shutil.copy(csv_file_path, backup_csv_file_path) - print("Reading from %s" % str(csv_file_path)) + print(f"Reading from {csv_file_path!s}") # Read the whole file into memory first with csv_file_path.open("r", encoding="utf-8") as f: depfile_reader = csv.reader(f, delimiter=";") @@ -69,7 +69,7 @@ def main(): deps.append(Dep(row[0], row[1], row[2])) csv_file_path = Path(REPO_DIR) / "cmake" / "deps.txt" - print("Writing to %s" % str(csv_file_path)) + print(f"Writing to {csv_file_path!s}") # Write updated content back with csv_file_path.open("w", newline="", encoding="utf-8") as f: depfile_writer = csv.writer(f, delimiter=";") diff --git a/tools/ci_build/upload_python_package_to_azure_storage.py b/tools/ci_build/upload_python_package_to_azure_storage.py index b7969f02e518e..16ff5d1f71611 100755 --- a/tools/ci_build/upload_python_package_to_azure_storage.py +++ b/tools/ci_build/upload_python_package_to_azure_storage.py @@ -62,7 +62,7 @@ def upload_whl(python_wheel_path, final_storage=False): with open(download_path_to_html, "w") as f: for item in lines: - f.write("%s\n" % item) + f.write(f"{item}\n") else: warnings.warn(f"'{new_line}' exists in {download_path_to_html}. 
The html file is not updated.") run_subprocess( diff --git a/tools/doc/rename_folders.py b/tools/doc/rename_folders.py index 90d800f2a4498..587755d101ce2 100644 --- a/tools/doc/rename_folders.py +++ b/tools/doc/rename_folders.py @@ -26,7 +26,7 @@ def rename_folder(root): full_into = os.path.join(r, into) if os.path.exists(full_into): raise RuntimeError("%r already exists, previous documentation should be removed.") - print("rename %r" % full_src) + print(f"rename {full_src!r}") os.rename(full_src, full_into) return renamed @@ -51,13 +51,13 @@ def replace_files(root, renamed): for k, v in subs.items(): if k == v: raise ValueError(f"{k!r} == {v!r}") - if ('"%s' % k) in f[0]: - repl.append((f[0], f[0].replace('"%s' % k, '"%s' % v))) - if ("/%s" % k) in f[0]: - repl.append((f[0], f[0].replace("/%s" % k, "/%s" % v))) + if (f'"{k}') in f[0]: + repl.append((f[0], f[0].replace(f'"{k}', f'"{v}'))) + if (f"/{k}") in f[0]: + repl.append((f[0], f[0].replace(f"/{k}", f"/{v}"))) if len(repl) == 0: continue - print("update %r" % full) + print(f"update {full!r}") for k, v in repl: content = content.replace(k, v) with open(full, "w", encoding="utf-8") as f: @@ -71,7 +71,7 @@ def replace_files(root, renamed): root = sys.argv[-1] else: root = "../../build/docs/html" - print("look into %r" % root) + print(f"look into {root!r}") ren = rename_folder(root) if len(ren) == 0: ren = [ diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index 60d1884a9591f..a005bd4c4b89d 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -67,7 +67,7 @@ def generate_file_list_for_ep(nuget_artifacts_dir, ep, files_list, include_pdbs, and package_name != "Microsoft.ML.OnnxRuntime.Gpu.Linux" ): files_list.append( - '' % cpu_arch + '' ) for cpu_arch in ["x86_64", "arm64"]: if child.name == get_package_name("osx", cpu_arch, ep, is_training_package): @@ -79,7 +79,7 @@ def generate_file_list_for_ep(nuget_artifacts_dir, ep, files_list, include_pdbs, is_versioned_dylib = re.match(r".*[\.\d+]+\.dylib$", child_file.name) if child_file.is_file() and child_file.suffix == ".dylib" and not is_versioned_dylib: files_list.append( - '' % cpu_arch + '' ) for cpu_arch in ["x64", "aarch64"]: if child.name == get_package_name("linux", cpu_arch, ep, is_training_package): @@ -97,7 +97,7 @@ def generate_file_list_for_ep(nuget_artifacts_dir, ep, files_list, include_pdbs, and package_name != "Microsoft.ML.OnnxRuntime.Gpu.Windows" ): files_list.append( - '' % cpu_arch + '' ) if child.name == "onnxruntime-android" or child.name == "onnxruntime-training-android": diff --git a/tools/python/onnx_test_data_utils.py b/tools/python/onnx_test_data_utils.py index 56485bb78abbd..d50d610a903b7 100644 --- a/tools/python/onnx_test_data_utils.py +++ b/tools/python/onnx_test_data_utils.py @@ -59,7 +59,7 @@ def image_to_numpy(filename, shape, channels_last, add_batch_dim): # target size. 
w_ratio = new_w / w h_ratio = new_h / h - ratio = w_ratio if w_ratio > h_ratio else h_ratio + ratio = max(h_ratio, w_ratio) interim_w = int(w * ratio) interim_h = int(h * ratio) img = img.resize((interim_w, interim_h), PIL.Image.ANTIALIAS) diff --git a/tools/python/util/mobile_helpers/usability_checker.py b/tools/python/util/mobile_helpers/usability_checker.py index 3d8042ad5412b..a8b5021f1387b 100644 --- a/tools/python/util/mobile_helpers/usability_checker.py +++ b/tools/python/util/mobile_helpers/usability_checker.py @@ -29,7 +29,7 @@ def __init__(self, filename): self._ops_seen = set() with open(filename) as f: - for line in f.readlines(): + for line in f: # we're looking for a markdown table with 2 columns. first is op name. second is caveats # op name is domain:op if line.startswith("|"): diff --git a/tools/python/util/reduced_build_config_parser.py b/tools/python/util/reduced_build_config_parser.py index cb90026808fde..be39562e2d60d 100644 --- a/tools/python/util/reduced_build_config_parser.py +++ b/tools/python/util/reduced_build_config_parser.py @@ -113,7 +113,7 @@ def process_non_op_line(line): return False with open(config_file) as config: - for line in [orig_line.strip() for orig_line in config.readlines()]: + for line in [orig_line.strip() for orig_line in config]: if process_non_op_line(line): continue diff --git a/winml/lib/Api.Image/CpuDetensorizer.h b/winml/lib/Api.Image/CpuDetensorizer.h index e175fbbb4b6a3..04d828097ff3b 100644 --- a/winml/lib/Api.Image/CpuDetensorizer.h +++ b/winml/lib/Api.Image/CpuDetensorizer.h @@ -36,7 +36,8 @@ class CpuDetensorizer { auto nominalRangeConverter = NominalRangeConverter(pixelRange); - if (formatFrom == formatTo && (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) { + if (formatFrom == formatTo && + (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) { for (uint32_t i = 0; i < tensorHeight; i++) { BYTE* pPixel = pData; @@ -52,7 +53,8 @@ class CpuDetensorizer { pData += bufferWidth; } - } else if ((formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) || (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8)) { + } else if ((formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) || + (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8)) { for (uint32_t i = 0; i < tensorHeight; i++) { BYTE* pPixel = pData; @@ -68,7 +70,8 @@ class CpuDetensorizer { pData += bufferWidth; } - } else if (formatFrom == kImageTensorChannelTypeGRAY8 && (formatTo == kImageTensorChannelTypeBGR8 || formatTo == kImageTensorChannelTypeRGB8)) { + } else if (formatFrom == kImageTensorChannelTypeGRAY8 && + (formatTo == kImageTensorChannelTypeBGR8 || formatTo == kImageTensorChannelTypeRGB8)) { // just replicate the gray data across each channel for (uint32_t i = 0; i < end; i += bufferWidth) { for (uint32_t j = i; j < i + bytesPerRow; j += 4) { diff --git a/winml/lib/Api.Image/CpuTensorizer.h b/winml/lib/Api.Image/CpuTensorizer.h index ca5773b28fce2..ed9006470fd0e 100644 --- a/winml/lib/Api.Image/CpuTensorizer.h +++ b/winml/lib/Api.Image/CpuTensorizer.h @@ -39,7 +39,8 @@ class CpuTensorizer { auto nominalRangeConverter = NominalRangeConverter(pixelRange); - if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeRGB8) { + if (formatFrom == 
kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeBGR8 || + formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeRGB8) { // Convert BGR8 -> BGR8 or RGB8 -> RGB8 for (uint64_t y = 0; y < yElements; y++) { DeinterleaveRowByteToFloat( @@ -52,7 +53,8 @@ class CpuTensorizer { nominalRangeConverter ); } - } else if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8 || formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) { + } else if (formatFrom == kImageTensorChannelTypeBGR8 && formatTo == kImageTensorChannelTypeRGB8 || + formatFrom == kImageTensorChannelTypeRGB8 && formatTo == kImageTensorChannelTypeBGR8) { // Convert RGB8 -> BGR8 or BGR8 -> RGB8 for (uint32_t y = 0; y < yElements; y++) { DeinterleaveRowByteToFloat( @@ -65,7 +67,8 @@ class CpuTensorizer { nominalRangeConverter ); } - } else if (formatTo == kImageTensorChannelTypeGRAY8 && (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) { + } else if (formatTo == kImageTensorChannelTypeGRAY8 && + (formatFrom == kImageTensorChannelTypeBGR8 || formatFrom == kImageTensorChannelTypeRGB8)) { // Convert BGR8 -> GRAY8 or RGB8 -> GRAY8 uint32_t blueIncrement = formatFrom == kImageTensorChannelTypeBGR8 ? 0 : 2; uint32_t redIncrement = formatFrom == kImageTensorChannelTypeBGR8 ? 2 : 0; @@ -80,7 +83,8 @@ class CpuTensorizer { pixelInd++; } } - } else if (formatFrom == kImageTensorChannelTypeGRAY8 && (formatTo == kImageTensorChannelTypeBGR8 || formatTo == kImageTensorChannelTypeRGB8)) { + } else if (formatFrom == kImageTensorChannelTypeGRAY8 && + (formatTo == kImageTensorChannelTypeBGR8 || formatTo == kImageTensorChannelTypeRGB8)) { // Convert GRAY8 -> BGR8 or GRAY8 -> RGB8 for (UINT32 i = start; i < end; i += bufferWidth) { for (UINT32 j = i; j < i + bytesPerRow; j += bytesPerPixel) { diff --git a/winml/lib/Api.Image/D3DDeviceCache.cpp b/winml/lib/Api.Image/D3DDeviceCache.cpp index 977f2ba75216a..549a7bba77ef6 100644 --- a/winml/lib/Api.Image/D3DDeviceCache.cpp +++ b/winml/lib/Api.Image/D3DDeviceCache.cpp @@ -349,7 +349,8 @@ ID3D12RootSignature* D3DDeviceCache::GetTensorizeRootSignature() { newRootSignature->SetName(L"Tensorize Rootsignature"); } - if (InterlockedCompareExchangePointer(tensorize_root_signature_.put_void(), newRootSignature.get(), nullptr) == nullptr) { + if (InterlockedCompareExchangePointer(tensorize_root_signature_.put_void(), newRootSignature.get(), nullptr) == + nullptr) { // This thread won the race and just cached the PSO newRootSignature.detach(); } @@ -401,7 +402,8 @@ ID3D12RootSignature* D3DDeviceCache::GetDetensorizeRootSignature() { newRootSignature->SetName(L"Detensorize Rootsignature"); } - if (InterlockedCompareExchangePointer(detensorize_root_signature_.put_void(), newRootSignature.get(), nullptr) == nullptr) { + if (InterlockedCompareExchangePointer(detensorize_root_signature_.put_void(), newRootSignature.get(), nullptr) == + nullptr) { // This thread won the race and just cached the PSO newRootSignature.detach(); } @@ -416,7 +418,8 @@ ID3D12PipelineState* D3DDeviceCache::GetCachedPipelineState( PipelineStateCacheFormat formatTo, PipelineStateCacheOperation operation ) { - if (cached_pipeline_state[static_cast(type)][static_cast(formatFrom)][static_cast(formatTo)][static_cast(operation)] == nullptr) { + if (cached_pipeline_state[static_cast(type)][static_cast(formatFrom)][static_cast(formatTo)] + [static_cast(operation)] == nullptr) { winrt::com_ptr newPSO; if (operation == 
PipelineStateCacheOperation::kTensorize) { newPSO.attach(CreateTensorizePipelineState(type, formatFrom, formatTo)); @@ -425,12 +428,12 @@ ID3D12PipelineState* D3DDeviceCache::GetCachedPipelineState( } if (InterlockedCompareExchangePointer( - cached_pipeline_state[static_cast(type)][static_cast(formatFrom)][static_cast(formatTo)] - [static_cast(operation)] - .put_void(), - newPSO.get(), - nullptr - ) == nullptr) { + cached_pipeline_state[static_cast(type)][static_cast(formatFrom)][static_cast(formatTo)] + [static_cast(operation)] + .put_void(), + newPSO.get(), + nullptr + ) == nullptr) { // This thread won the race and just cached the PSO newPSO.detach(); } @@ -653,7 +656,8 @@ ID3D12Resource* D3DDeviceCache::GetDetensorizeVertexBuffer(_Out_ UINT* vertexBuf memcpy(pVertexDataBegin, triangleVertices, sizeof(triangleVertices)); newResource->Unmap(0, nullptr); - if (InterlockedCompareExchangePointer(detensorize_vertex_buffer_.put_void(), newResource.get(), nullptr) == nullptr) { + if (InterlockedCompareExchangePointer(detensorize_vertex_buffer_.put_void(), newResource.get(), nullptr) == + nullptr) { // This thread won the race and just cached the PSO newResource.detach(); } diff --git a/winml/lib/Api.Image/EventTimer.h b/winml/lib/Api.Image/EventTimer.h index 3620a7a2c0ee1..590675646b70d 100644 --- a/winml/lib/Api.Image/EventTimer.h +++ b/winml/lib/Api.Image/EventTimer.h @@ -4,7 +4,9 @@ class EventTimer { public: bool Start() { auto now = std::chrono::high_resolution_clock::now(); - if (!_started || std::chrono::duration_cast(now - _startTime).count() > _kDurationBetweenSendingEvents) { + if (!_started || + std::chrono::duration_cast(now - _startTime).count() > + _kDurationBetweenSendingEvents) { _started = true; _startTime = std::chrono::high_resolution_clock::now(); return true; diff --git a/winml/lib/Api.Image/ImageConversionHelpers.cpp b/winml/lib/Api.Image/ImageConversionHelpers.cpp index 11434c5fffb8e..441413bface28 100644 --- a/winml/lib/Api.Image/ImageConversionHelpers.cpp +++ b/winml/lib/Api.Image/ImageConversionHelpers.cpp @@ -69,7 +69,8 @@ void _winmli::ConvertVideoFrameToVideoFrame( wgdx::Direct3D11::IDirect3DSurface spInputDirect3DSurface = inputVideoFrame.Direct3DSurface(); // only one of softwarebitmap or direct3Dsurface should be non-null - if ((spInputSoftwareBitmap == nullptr && spInputDirect3DSurface == nullptr) || (spInputSoftwareBitmap != nullptr && spInputDirect3DSurface != nullptr)) { + if ((spInputSoftwareBitmap == nullptr && spInputDirect3DSurface == nullptr) || + (spInputSoftwareBitmap != nullptr && spInputDirect3DSurface != nullptr)) { WINML_THROW_HR(E_INVALIDARG); } @@ -133,11 +134,9 @@ bool _winmli::NeedsVideoFrameConversion( if (FAILED((hr = GetVideoFrameInfo(inputVideoFrame, format, width, height, luid)))) { bNeedConversion = true; - } else if (((int)inputBounds.Width != outputWidth) || - (inputBounds.X != 0) || - ((int)inputBounds.Height != outputHeight) || - (inputBounds.Y != 0) || - (inputVideoFrame == nullptr)) // Check crop + } else if (((int)inputBounds.Width != outputWidth) || (inputBounds.X != 0) || + ((int)inputBounds.Height != outputHeight) || (inputBounds.Y != 0) || + (inputVideoFrame == nullptr)) // Check crop { bNeedConversion = true; } else if (luid.HighPart != outputLuid.HighPart || luid.LowPart != outputLuid.LowPart) { diff --git a/winml/lib/Api.Image/ImageConverter.cpp b/winml/lib/Api.Image/ImageConverter.cpp index bb97f0ec7ff34..84b6f5a3a4c5c 100644 --- a/winml/lib/Api.Image/ImageConverter.cpp +++ b/winml/lib/Api.Image/ImageConverter.cpp @@ -50,7 
+50,8 @@ ComPtr ImageConverter::FetchOrCreateFenceOnDevice( ComPtr fence; UINT comPtrSize = static_cast(sizeof(fence.GetAddressOf())); - if (FAILED(pD3D11Device->GetPrivateData(device_cache.GetFenceGuid(), &comPtrSize, fence.GetAddressOf())) || fence.Get() == nullptr) { + if (FAILED(pD3D11Device->GetPrivateData(device_cache.GetFenceGuid(), &comPtrSize, fence.GetAddressOf())) || + fence.Get() == nullptr) { // There's no fence on the device, so create a new one ComPtr spD3D11Device5; WINML_THROW_IF_FAILED(pD3D11Device->QueryInterface(IID_PPV_ARGS(&spD3D11Device5))); diff --git a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp index 76a5623c5b4a5..456931d21e0a8 100644 --- a/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp +++ b/winml/lib/Api.Image/TensorToVideoFrameConverter.cpp @@ -196,8 +196,9 @@ void TensorToVideoFrameConverter::DX12TensorToVideoFrame( UINT comPtrSize = static_cast(sizeof(spSharedD3D11Texture.GetAddressOf())); UINT handleSize = static_cast(sizeof(sharedHandle)); - if ((FAILED(spVideoFrameTexture->GetPrivateData( - _d3d11TextureGUID, &comPtrSize, spSharedD3D11Texture.GetAddressOf())) || + if ((FAILED( + spVideoFrameTexture->GetPrivateData(_d3d11TextureGUID, &comPtrSize, spSharedD3D11Texture.GetAddressOf()) + ) || !spSharedD3D11Texture.Get()) || (FAILED(spVideoFrameTexture->GetPrivateData(_handleGUID, &handleSize, &sharedHandle)) || sharedHandle != shared_handle_)) { @@ -365,7 +366,8 @@ void TensorToVideoFrameConverter::SoftwareTensorToVideoFrame( wgdx::Direct3D11::IDirect3DSurface spOutputSurface = pDestVideoFrame.Direct3DSurface(); // only one of softwarebitmap or direct3Dsurface should be non-null - if ((spOutputSoftwareBitmap == nullptr && spOutputSurface == nullptr) || (spOutputSoftwareBitmap != nullptr && spOutputSurface != nullptr)) { + if ((spOutputSoftwareBitmap == nullptr && spOutputSurface == nullptr) || + (spOutputSoftwareBitmap != nullptr && spOutputSurface != nullptr)) { WINML_THROW_HR(E_INVALIDARG); } if (spOutputSoftwareBitmap) { @@ -381,7 +383,10 @@ void TensorToVideoFrameConverter::SoftwareTensorToVideoFrame( if (_winmli::NeedsVideoFrameConversion( pDestVideoFrame, {}, {0, 0, (UINT32)tensorWidth, (UINT32)tensorHeight}, tensorWidth, tensorHeight )) { - if (converted_video_frame_ == nullptr || _winmli::NeedsVideoFrameConversion(converted_video_frame_, {}, {0, 0, (UINT32)tensorWidth, (UINT32)tensorHeight}, tensorWidth, tensorHeight)) { + if (converted_video_frame_ == nullptr || + _winmli::NeedsVideoFrameConversion( + converted_video_frame_, {}, {0, 0, (UINT32)tensorWidth, (UINT32)tensorHeight}, tensorWidth, tensorHeight + )) { converted_video_frame_ = wm::VideoFrame::CreateWithSoftwareBitmap( wgi::SoftwareBitmap(wgi::BitmapPixelFormat::Bgra8, tensorWidth, tensorHeight) ); diff --git a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp index 0a763c77c94f4..a9b507ae4e16f 100644 --- a/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp +++ b/winml/lib/Api.Image/VideoFrameToTensorConverter.cpp @@ -138,14 +138,19 @@ void VideoFrameToTensorConverter::VideoFrameToSoftwareTensor( wgdx::Direct3D11::IDirect3DSurface spInputSurface = inputVideoFrame.Direct3DSurface(); // only one of softwarebitmap or direct3Dsurface should be non-null - if ((spInputSoftwareBitmap == nullptr && spInputSurface == nullptr) || (spInputSoftwareBitmap != nullptr && spInputSurface != nullptr)) { + if ((spInputSoftwareBitmap == nullptr && spInputSurface == nullptr) || + 
(spInputSoftwareBitmap != nullptr && spInputSurface != nullptr)) { WINML_THROW_IF_FAILED(E_INVALIDARG); } UINT32 tensorHeight = static_cast(tensorDesc.sizes[2]); UINT32 tensorWidth = static_cast(tensorDesc.sizes[3]); - if (spInputSurface || _winmli::NeedsVideoFrameConversion(inputVideoFrame, {}, inputBounds, tensorWidth, tensorHeight)) { - if (converted_video_frame_ == nullptr || _winmli::NeedsVideoFrameConversion(converted_video_frame_, {}, {0, 0, (UINT32)tensorWidth, (UINT32)tensorHeight}, tensorWidth, tensorHeight)) { + if (spInputSurface || + _winmli::NeedsVideoFrameConversion(inputVideoFrame, {}, inputBounds, tensorWidth, tensorHeight)) { + if (converted_video_frame_ == nullptr || + _winmli::NeedsVideoFrameConversion( + converted_video_frame_, {}, {0, 0, (UINT32)tensorWidth, (UINT32)tensorHeight}, tensorWidth, tensorHeight + )) { converted_video_frame_ = wm::VideoFrame::CreateWithSoftwareBitmap( wgi::SoftwareBitmap(wgi::BitmapPixelFormat::Bgra8, tensorWidth, tensorHeight) ); @@ -236,8 +241,8 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( // TODO: Scale during the tensorization phase instead of using the video frame pipeline when the input bounds are not the same size as the tensor if (!_winmli::DirectXPixelFormatSupported(spDirect3DSurface.Description().Format) || - static_cast(inputBounds.Width) != tensorDesc.sizes[3] || - static_cast(inputBounds.Height) != tensorDesc.sizes[2]) { + static_cast(inputBounds.Width) != tensorDesc.sizes[3] || + static_cast(inputBounds.Height) != tensorDesc.sizes[2]) { // Force the VideoFrame to not do a conversion if the format is supported since we do it during the tensorization anyway wgdx::DirectXPixelFormat newFormat = _winmli::DirectXPixelFormatSupported(spDirect3DSurface.Description().Format) ? 
spDirect3DSurface.Description().Format @@ -269,7 +274,7 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( D3D11_cached_texture_->GetDesc(&cachedTextureDesc); if (cachedTextureDesc.Width != scaledBounds.Width || cachedTextureDesc.Height != scaledBounds.Height || - cachedTextureDesc.Format != videoFrameTextureDesc.Format) { + cachedTextureDesc.Format != videoFrameTextureDesc.Format) { // The dimensions or format don't match, so we need to re-create our texture WINML_THROW_IF_FAILED( pDeviceCache->GetD3D11Device()->CreateTexture2D(&videoFrameTextureDesc, nullptr, &D3D11_cached_texture_) @@ -289,12 +294,12 @@ void VideoFrameToTensorConverter::VideoFrameToDX12Tensor( UINT comPtrSize = static_cast(sizeof(spSharedD3D11Texture.GetAddressOf())); UINT handleSize = static_cast(sizeof(sharedHandle)); - if ((FAILED(spVideoFrameTexture->GetPrivateData( - d3d11_texture_GUID_, &comPtrSize, spSharedD3D11Texture.GetAddressOf() - )) || - !spSharedD3D11Texture.Get()) || - (FAILED(spVideoFrameTexture->GetPrivateData(handle_GUID_, &handleSize, &sharedHandle)) || - sharedHandle != shared_handle_)) { + if ((FAILED( + spVideoFrameTexture->GetPrivateData(d3d11_texture_GUID_, &comPtrSize, spSharedD3D11Texture.GetAddressOf()) + ) || + !spSharedD3D11Texture.Get()) || + (FAILED(spVideoFrameTexture->GetPrivateData(handle_GUID_, &handleSize, &sharedHandle)) || + sharedHandle != shared_handle_)) { // Create a new shared texture that we cache on the video frame texture WINML_THROW_IF_FAILED(spTextureDevice->CreateTexture2D(&videoFrameTextureDesc, nullptr, &spSharedD3D11Texture)); @@ -423,9 +428,9 @@ void VideoFrameToTensorConverter::ConvertDX12TextureToGPUTensor( WINML_THROW_IF_FAILED(ULongLongMult(ullNumElementsTensor, uiTensorElementSize, &ullTensorSize)); if (outputDesc.Width < ullTensorSize || outputDesc.Height != 1 || - outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER || - !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS) || - outputHeapProperties.Type != D3D12_HEAP_TYPE_DEFAULT) { + outputDesc.Dimension != D3D12_RESOURCE_DIMENSION_BUFFER || + !(outputDesc.Flags & D3D12_RESOURCE_FLAG_ALLOW_UNORDERED_ACCESS) || + outputHeapProperties.Type != D3D12_HEAP_TYPE_DEFAULT) { WINML_THROW_IF_FAILED(E_INVALIDARG); } } @@ -565,7 +570,8 @@ void VideoFrameToTensorConverter::ConvertSoftwareBitmapToGPUTensor( wgi::BitmapBounds scaledBounds = inputBounds; // TODO: Scale during the tensorization phase instead of using the video frame pipeline when the input bounds are not the same size as the tensor - if (static_cast(inputBounds.Width) != tensorDesc.sizes[3] || static_cast(inputBounds.Height) != tensorDesc.sizes[2]) { + if (static_cast(inputBounds.Width) != tensorDesc.sizes[3] || + static_cast(inputBounds.Height) != tensorDesc.sizes[2]) { scaledBounds = {0, 0, static_cast(tensorDesc.sizes[3]), static_cast(tensorDesc.sizes[2])}; // Force the VideoFrame to not do a conversion if the format is supported since we do it during the tensorization anyway diff --git a/winml/lib/Api.Image/inc/ConverterResourceStore.h b/winml/lib/Api.Image/inc/ConverterResourceStore.h index ffb413e0b92f3..24406c9fdaaef 100644 --- a/winml/lib/Api.Image/inc/ConverterResourceStore.h +++ b/winml/lib/Api.Image/inc/ConverterResourceStore.h @@ -25,7 +25,7 @@ struct ConverterResourceDescription { // 2) the resources are on different devices // 3) the resources have different pixel formats if (desc.width != width || desc.height != height || desc.luid.HighPart != luid.HighPart || - desc.luid.LowPart != luid.LowPart || desc.pixel_format != 
pixel_format) { + desc.luid.LowPart != luid.LowPart || desc.pixel_format != pixel_format) { return false; } diff --git a/winml/lib/Api/FeatureValues.h b/winml/lib/Api/FeatureValues.h index a330b244d40fc..fe6429f0a421b 100644 --- a/winml/lib/Api/FeatureValues.h +++ b/winml/lib/Api/FeatureValues.h @@ -29,37 +29,37 @@ #include "ImageFeatureValue.h" // CREATE_TENSOR is used by data tensor types to implement common functionality -#define CREATE_TENSOR(type, element_type, element_view_type) \ - namespace WINMLP { \ - struct type : public _winml::TensorBase< \ - element_type, \ - element_view_type, \ - type, \ - I##type, \ - type##T> { \ - using Base = TensorBase< \ - element_type, \ - element_view_type, \ - type, \ - I##type, \ - type##T>; \ - \ - type() = default; \ - \ - type(wfc::IIterable const& shape) : Base(shape){}; \ - \ - type(std::vector const& shape) : Base(shape){}; \ - \ - type(std::vector const& shape, ID3D12Resource* pResource) : Base(shape, pResource){}; \ - }; \ - } \ - namespace WINML::factory_implementation { \ - struct type : type##T { \ - STDMETHOD(CreateFromD3D12Resource) \ - (ID3D12Resource * value, __int64* shape, int shapeSize, IUnknown** result) { \ - return winmlp::type::CreateFromD3D12Resource(value, shape, shapeSize, result); \ - } \ - }; \ +#define CREATE_TENSOR(type, element_type, element_view_type) \ + namespace WINMLP { \ + struct type : public _winml::TensorBase< \ + element_type, \ + element_view_type, \ + type, \ + I##type, \ + type##T> { \ + using Base = TensorBase< \ + element_type, \ + element_view_type, \ + type, \ + I##type, \ + type##T>; \ + \ + type() = default; \ + \ + type(wfc::IIterable const& shape) : Base(shape) {}; \ + \ + type(std::vector const& shape) : Base(shape) {}; \ + \ + type(std::vector const& shape, ID3D12Resource* pResource) : Base(shape, pResource) {}; \ + }; \ + } \ + namespace WINML::factory_implementation { \ + struct type : type##T { \ + STDMETHOD(CreateFromD3D12Resource) \ + (ID3D12Resource * value, __int64* shape, int shapeSize, IUnknown** result) { \ + return winmlp::type::CreateFromD3D12Resource(value, shape, shapeSize, result); \ + } \ + }; \ } CREATE_TENSOR(TensorBoolean, bool, bool) @@ -86,11 +86,11 @@ CREATE_TENSOR(TensorString, std::string, winrt::hstring) #pragma warning(pop) // CREATE_MAP is used by map types to implement common functionality -#define CREATE_MAP(type, key_type, value_type) \ - namespace WINMLP { \ - struct type : public _winml::MapBase { \ - type(wfc::IMap const& data) : MapBase(data){}; \ - }; \ +#define CREATE_MAP(type, key_type, value_type) \ + namespace WINMLP { \ + struct type : public _winml::MapBase { \ + type(wfc::IMap const& data) : MapBase(data) {}; \ + }; \ } CREATE_MAP(MapInt64BitToInt64Bit, int64_t, int64_t) @@ -103,11 +103,11 @@ CREATE_MAP(MapStringToDouble, hstring, double) CREATE_MAP(MapStringToString, hstring, hstring) // CREATE_SEQUENCE is used by sequence types to implement common functionality -#define CREATE_SEQUENCE(type, element_type, raw_type) \ - namespace WINMLP { \ - struct type : public _winml::SequenceBase { \ - type(wfc::IIterable const& data) : SequenceBase(data){}; \ - }; \ +#define CREATE_SEQUENCE(type, element_type, raw_type) \ + namespace WINMLP { \ + struct type : public _winml::SequenceBase { \ + type(wfc::IIterable const& data) : SequenceBase(data) {}; \ + }; \ } using AbiMapStringFloat = wfc::IMap; diff --git a/winml/lib/Api/ImageFeatureValue.cpp b/winml/lib/Api/ImageFeatureValue.cpp index 8628c578e5004..65f2e56180e19 100644 --- a/winml/lib/Api/ImageFeatureValue.cpp 
+++ b/winml/lib/Api/ImageFeatureValue.cpp @@ -221,7 +221,9 @@ static _winml::ImageTensorDescription CreateImageTensorDescriptor( THROW_HR(E_NOTIMPL); } - if (pixelRange != winml::LearningModelPixelRange::ZeroTo255 && pixelRange != winml::LearningModelPixelRange::ZeroToOne && pixelRange != winml::LearningModelPixelRange::MinusOneToOne) { + if (pixelRange != winml::LearningModelPixelRange::ZeroTo255 && + pixelRange != winml::LearningModelPixelRange::ZeroToOne && + pixelRange != winml::LearningModelPixelRange::MinusOneToOne) { THROW_HR(E_NOTIMPL); } @@ -331,12 +333,11 @@ std::optional ImageFeatureValue::GetIn // The the widths and heights of input data must be the same. Or the // tensorDescriptor cannot describ the shape of the inputs. if (spImageDescriptor->Width() == MAXUINT32 && - !(std::adjacent_find(m_widths.begin(), m_widths.end(), std::not_equal_to()) == m_widths.end())) { + !(std::adjacent_find(m_widths.begin(), m_widths.end(), std::not_equal_to()) == m_widths.end())) { THROW_HR(E_INVALIDARG); } if (spImageDescriptor->Height() == MAXUINT32 && - !(std::adjacent_find(m_heights.begin(), m_heights.end(), std::not_equal_to()) == m_heights.end() - )) { + !(std::adjacent_find(m_heights.begin(), m_heights.end(), std::not_equal_to()) == m_heights.end())) { THROW_HR(E_INVALIDARG); } descriptorWidth = (spImageDescriptor->Width() == MAXUINT32) ? m_widths[0] : spImageDescriptor->Width(); @@ -354,12 +355,11 @@ std::optional ImageFeatureValue::GetIn return {}; } if (-1 == shape.GetAt(3) && - !(std::adjacent_find(m_widths.begin(), m_widths.end(), std::not_equal_to()) == m_widths.end())) { + !(std::adjacent_find(m_widths.begin(), m_widths.end(), std::not_equal_to()) == m_widths.end())) { THROW_HR(E_INVALIDARG); } if (-1 == shape.GetAt(2) && - !(std::adjacent_find(m_heights.begin(), m_heights.end(), std::not_equal_to()) == m_heights.end() - )) { + !(std::adjacent_find(m_heights.begin(), m_heights.end(), std::not_equal_to()) == m_heights.end())) { THROW_HR(E_INVALIDARG); } descriptorWidth = (-1 == shape.GetAt(3)) ? m_widths[0] : static_cast(shape.GetAt(3)); diff --git a/winml/lib/Api/LearningModel.cpp b/winml/lib/Api/LearningModel.cpp index 6d7c8317ce5f9..8de14a5dfce10 100644 --- a/winml/lib/Api/LearningModel.cpp +++ b/winml/lib/Api/LearningModel.cpp @@ -64,7 +64,7 @@ LearningModel::LearningModel(const hstring& path, const winml::ILearningModelOpe WINML_THROW_IF_FAILED(CreateOnnxruntimeEngineFactory(engine_factory_.put())); - wil::unique_handle file_handle { + wil::unique_handle file_handle{ #if WINVER >= _WIN32_WINNT_WIN8 CreateFile2(path.c_str(), GENERIC_READ, FILE_SHARE_READ, OPEN_EXISTING, NULL) }; diff --git a/winml/lib/Api/LearningModelSession.cpp b/winml/lib/Api/LearningModelSession.cpp index 011a4a718f82a..57bafda57fe54 100644 --- a/winml/lib/Api/LearningModelSession.cpp +++ b/winml/lib/Api/LearningModelSession.cpp @@ -21,8 +21,8 @@ static const auto c_enable_debug_output = L"EnableDebugOutput"; namespace guid_details { // This GUID is to be used for delimiting ML-related categories of capturable work. 
// {D113B493-BBA2-4993-8608-D706A73B91CE} -struct __declspec(uuid("D113B493-BBA2-4993-8608-D706A73B91CE")) __declspec(novtable -) WINML_PIX_EVAL_CAPTURABLE_WORK_GUID {}; +struct __declspec(uuid("D113B493-BBA2-4993-8608-D706A73B91CE")) +__declspec(novtable) WINML_PIX_EVAL_CAPTURABLE_WORK_GUID {}; } // namespace guid_details static const GUID WINML_PIX_EVAL_CAPTURABLE_WORK_GUID = __uuidof(guid_details::WINML_PIX_EVAL_CAPTURABLE_WORK_GUID); diff --git a/winml/lib/Api/NumericData.cpp b/winml/lib/Api/NumericData.cpp index ae5f9155d425c..1e3ba5438c10a 100644 --- a/winml/lib/Api/NumericData.cpp +++ b/winml/lib/Api/NumericData.cpp @@ -68,9 +68,7 @@ gsl::span numeric_data::buffer(bool should_sync_buffer) { } auto span = combined_buffer(); if (should_sync_buffer) { - _winml::LoadSpanFromDisjointBuffers( - buffers_.size(), [this](size_t i) { return buffer_at(i); }, span - ); + _winml::LoadSpanFromDisjointBuffers(buffers_.size(), [this](size_t i) { return buffer_at(i); }, span); } return span; @@ -80,9 +78,7 @@ bool numeric_data::flush() { auto should_flush = buffers_.size() != 1; if (should_flush) { auto span = combined_buffer(); - _winml::StoreSpanIntoDisjointBuffers( - buffers_.size(), [this](size_t i) { return buffer_at(i); }, span - ); + _winml::StoreSpanIntoDisjointBuffers(buffers_.size(), [this](size_t i) { return buffer_at(i); }, span); } return should_flush; } @@ -97,9 +93,7 @@ void numeric_data::set(size_t data_size, const byte* data) { ); gsl::span span(const_cast(data), data_size); - _winml::StoreSpanIntoDisjointBuffers( - buffers_.size(), [this](size_t i) { return buffer_at(i); }, span - ); + _winml::StoreSpanIntoDisjointBuffers(buffers_.size(), [this](size_t i) { return buffer_at(i); }, span); } static gsl::span get_span_from_ibuffer(wss::IBuffer buffer) { diff --git a/winml/lib/Api/impl/FeatureCompatibility.h b/winml/lib/Api/impl/FeatureCompatibility.h index 3fff488be23f7..1b124097f3f80 100644 --- a/winml/lib/Api/impl/FeatureCompatibility.h +++ b/winml/lib/Api/impl/FeatureCompatibility.h @@ -375,11 +375,11 @@ static void (*FeatureKindCompatibilityMatrix[4][4])( ) = { // Tensor, Sequence, Map, Image /* Tensor */ {verify, not_compatible, not_compatible, verify}, - /* Sequence */ + /* Sequence */ {not_compatible, verify, not_compatible, not_compatible}, - /* Map */ + /* Map */ {not_compatible, not_compatible, verify, not_compatible}, - /* Image */ + /* Image */ {verify, not_compatible, not_compatible, verify} }; } // namespace compatibility_details diff --git a/winml/lib/Common/CommonDeviceHelpers.cpp b/winml/lib/Common/CommonDeviceHelpers.cpp index 01615005a8947..b4ada6c498212 100644 --- a/winml/lib/Common/CommonDeviceHelpers.cpp +++ b/winml/lib/Common/CommonDeviceHelpers.cpp @@ -65,8 +65,10 @@ HRESULT GetDXCoreAdapterMetadata( RETURN_IF_FAILED(spFactory->GetAdapterByLuid(device.GetAdapterLuid(), IID_PPV_ARGS(spAdapter.put()))); if (spAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D12_CORE_COMPUTE) && - (!(spAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D12_GRAPHICS) || - spAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D11_GRAPHICS)))) { + (!( + spAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D12_GRAPHICS) || + spAdapter->IsAttributeSupported(DXCORE_ADAPTER_ATTRIBUTE_D3D11_GRAPHICS) + ))) { isMcdmAdapter = true; } else { isMcdmAdapter = false; diff --git a/winml/test/api/raw/buffer_backed_random_access_stream_reference.h b/winml/test/api/raw/buffer_backed_random_access_stream_reference.h index e9539c188e45a..6f492bf8340c9 100644 --- 
a/winml/test/api/raw/buffer_backed_random_access_stream_reference.h +++ b/winml/test/api/raw/buffer_backed_random_access_stream_reference.h @@ -347,8 +347,9 @@ struct BufferBackedRandomAccessStreamReference } virtual HRESULT STDMETHODCALLTYPE OpenReadAsync( - /* [retval, out] */ __RPC__deref_out_opt - __FIAsyncOperation_1_Windows__CStorage__CStreams__CIRandomAccessStreamWithContentType** operation + /* [retval, out] */ + __RPC__deref_out_opt __FIAsyncOperation_1_Windows__CStorage__CStreams__CIRandomAccessStreamWithContentType** + operation ) override { auto open_read_async = Microsoft::WRL::Make(); open_read_async.CopyTo(operation); diff --git a/winml/test/api/raw/winml_microsoft.h b/winml/test/api/raw/winml_microsoft.h index 92094188793d5..60527b238d8cd 100644 --- a/winml/test/api/raw/winml_microsoft.h +++ b/winml/test/api/raw/winml_microsoft.h @@ -141,8 +141,8 @@ struct TensorRuntimeClassID { static const wchar_t* RuntimeClass_ID; }; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Microsoft_AI_MachineLearning_TensorFloat; +__declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Microsoft_AI_MachineLearning_TensorFloat; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Microsoft_AI_MachineLearning_TensorFloat16Bit; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = @@ -161,10 +161,10 @@ __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClas RuntimeClass_Microsoft_AI_MachineLearning_TensorUInt64Bit; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Microsoft_AI_MachineLearning_TensorInt64Bit; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Microsoft_AI_MachineLearning_TensorBoolean; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Microsoft_AI_MachineLearning_TensorDouble; +__declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Microsoft_AI_MachineLearning_TensorBoolean; +__declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Microsoft_AI_MachineLearning_TensorDouble; template struct TensorFactory {}; @@ -319,30 +319,30 @@ struct TensorFactoryIID { static const GUID IID; }; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorFloatStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorFloat16BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt8BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt8BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt16BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt16BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt32BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt32BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt64BitStatics; -__declspec(selectany -) const GUID 
TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt64BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorBooleanStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorDoubleStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorFloatStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorFloat16BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt8BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt8BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt16BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt16BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt32BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt32BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt64BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt64BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorBooleanStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorDoubleStatics; template struct TensorFactory2IID {}; @@ -395,30 +395,30 @@ struct TensorFactory2IID { static const GUID IID; }; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorFloatStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorFloat16BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt8BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt8BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt16BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt16BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt32BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt32BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt64BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorInt64BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorBooleanStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Microsoft::AI::MachineLearning::IID_ITensorDoubleStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + 
ABI::Microsoft::AI::MachineLearning::IID_ITensorFloatStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorFloat16BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt8BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt8BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt16BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt16BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt32BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt32BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorUInt64BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorInt64BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorBooleanStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Microsoft::AI::MachineLearning::IID_ITensorDoubleStatics2; inline HRESULT GetActivationFactory(const wchar_t* p_class_id, const IID& iid, void** factory) noexcept { // Fallback to OS binary if the redistributable is not present! diff --git a/winml/test/api/raw/winml_windows.h b/winml/test/api/raw/winml_windows.h index 944daff6dd10a..8e72743f3d98b 100644 --- a/winml/test/api/raw/winml_windows.h +++ b/winml/test/api/raw/winml_windows.h @@ -141,12 +141,12 @@ struct TensorRuntimeClassID { static const wchar_t* RuntimeClass_ID; }; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorFloat; +__declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Windows_AI_MachineLearning_TensorFloat; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorFloat16Bit; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorInt8Bit; +__declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Windows_AI_MachineLearning_TensorInt8Bit; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorUInt8Bit; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = @@ -161,10 +161,10 @@ __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClas RuntimeClass_Windows_AI_MachineLearning_TensorUInt64Bit; __declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorInt64Bit; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorBoolean; -__declspec(selectany -) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = RuntimeClass_Windows_AI_MachineLearning_TensorDouble; +__declspec(selectany) const wchar_t* TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Windows_AI_MachineLearning_TensorBoolean; +__declspec(selectany) const wchar_t* 
TensorRuntimeClassID::RuntimeClass_ID = + RuntimeClass_Windows_AI_MachineLearning_TensorDouble; template struct TensorFactory {}; @@ -319,30 +319,30 @@ struct TensorFactoryIID { static const GUID IID; }; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorFloatStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorFloat16BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt8BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt8BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt16BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt16BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt32BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt32BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt64BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt64BitStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorBooleanStatics; -__declspec(selectany -) const GUID TensorFactoryIID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorDoubleStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorFloatStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorFloat16BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt8BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt8BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt16BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt16BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt32BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt32BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt64BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt64BitStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorBooleanStatics; +__declspec(selectany) const GUID TensorFactoryIID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorDoubleStatics; template struct TensorFactory2IID {}; @@ -395,30 +395,30 @@ struct TensorFactory2IID { static const GUID IID; }; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorFloatStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorFloat16BitStatics2; -__declspec(selectany -) const GUID 
TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt8BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt8BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt16BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt16BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt32BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt32BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorUInt64BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorInt64BitStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorBooleanStatics2; -__declspec(selectany -) const GUID TensorFactory2IID::IID = ABI::Windows::AI::MachineLearning::IID_ITensorDoubleStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorFloatStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorFloat16BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt8BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt8BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt16BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt16BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt32BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt32BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorUInt64BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorInt64BitStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorBooleanStatics2; +__declspec(selectany) const GUID TensorFactory2IID::IID = + ABI::Windows::AI::MachineLearning::IID_ITensorDoubleStatics2; inline HRESULT GetActivationFactory(const wchar_t* p_class_id, const IID& iid, void** factory) noexcept { // Fallback to OS binary if the redistributable is not present! 
diff --git a/winml/test/image/imagetests.cpp b/winml/test/image/imagetests.cpp index b408c0315f94a..04717c75aa150 100644 --- a/winml/test/image/imagetests.cpp +++ b/winml/test/image/imagetests.cpp @@ -212,13 +212,16 @@ class ImageTests : public ::testing::Test { const std::wstring& model_file_name, const std::wstring& image_file_name, const InputImageSource input_image_source ) { // Case that the tensor's shape doesn't match model's shape should be skipped - if ((L"1080.jpg" == image_file_name || L"kitten_224.png" == image_file_name) && (InputImageSource::FromGPUResource == input_image_source || InputImageSource::FromCPUResource == input_image_source)) { + if ((L"1080.jpg" == image_file_name || L"kitten_224.png" == image_file_name) && + (InputImageSource::FromGPUResource == input_image_source || + InputImageSource::FromCPUResource == input_image_source)) { return true; } // Case that the images's shape doesn't match model's shape which expects free dimension should be skipped. // Because the fns-candy is not real model that can handle free dimensional input - if ((L"1080.jpg" == image_file_name || L"kitten_224.png" == image_file_name) && L"fns-candy_Bgr8_freeDimInput.onnx" == model_file_name) { + if ((L"1080.jpg" == image_file_name || L"kitten_224.png" == image_file_name) && + L"fns-candy_Bgr8_freeDimInput.onnx" == model_file_name) { return true; } @@ -385,7 +388,8 @@ TEST_P(ImageTest, ImageTest) { GTEST_SKIP() << "This test is disabled"; } - if (LearningModelDeviceKind::Cpu != param.device_kind || InputImageSource::FromGPUResource == param.input_image_source) { + if (LearningModelDeviceKind::Cpu != param.device_kind || + InputImageSource::FromGPUResource == param.input_image_source) { GPUTEST; } @@ -482,13 +486,14 @@ TEST_P(BatchTest, BatchSupport) { if (param.use_session_options) { optimized_batch_size = param.use_session_options; } - if (VideoFrameSource::FromDirect3DSurface == param.video_frame_source && LearningModelDeviceKind::Cpu == param.device_kind) { + if (VideoFrameSource::FromDirect3DSurface == param.video_frame_source && + LearningModelDeviceKind::Cpu == param.device_kind) { return; } if (LearningModelDeviceKind::Cpu != param.device_kind || - VideoFrameSource::FromDirect3DSurface == param.video_frame_source || - VideoFrameSource::FromDirect3DSurface == param.output_video_frame_source || - VideoFrameSource::FromUnsupportedD3DSurface == param.output_video_frame_source) { + VideoFrameSource::FromDirect3DSurface == param.video_frame_source || + VideoFrameSource::FromDirect3DSurface == param.output_video_frame_source || + VideoFrameSource::FromUnsupportedD3DSurface == param.output_video_frame_source) { GPUTEST; } @@ -556,7 +561,7 @@ TEST_P(BatchTest, BatchSupport) { for (int i = 0; i < param.batch_size; ++i) { std::wstring bm_image_path = FileHelpers::GetModulePath() + L"batchGroundTruth\\" + param.input_images[i]; if (VideoFrameSource::FromSoftwareBitmap != param.output_video_frame_source && - OutputBindingStrategy::Unbound != param.output_binding_strategy) { + OutputBindingStrategy::Unbound != param.output_binding_strategy) { VideoFrame D3D_video_frame = output_video_frames.GetAt(i); VideoFrame SB_video_frame(BitmapPixelFormat::Bgra8, 720, 720); D3D_video_frame.as().CopyToAsync(SB_video_frame).get(); diff --git a/winml/test/model/compare_feature_value.cpp b/winml/test/model/compare_feature_value.cpp index 30b16c4ad5f73..ac2553987f5ad 100644 --- a/winml/test/model/compare_feature_value.cpp +++ b/winml/test/model/compare_feature_value.cpp @@ -13,7 +13,8 @@ template bool 
IsResultCloselyMatch(const T& outvalue, const T& expected_value, const double diff, const double tol) { if (diff > tol) return false; - if (std::isnan(diff) && !(std::isnan(outvalue) && std::isnan(expected_value)) && !(std::isinf(outvalue) && std::isinf(expected_value))) + if (std::isnan(diff) && !(std::isnan(outvalue) && std::isnan(expected_value)) && + !(std::isinf(outvalue) && std::isinf(expected_value))) return false; return true; } diff --git a/winml/test/model/model_tests.cpp b/winml/test/model/model_tests.cpp index 859914014b8bb..4087bfd87caa7 100644 --- a/winml/test/model/model_tests.cpp +++ b/winml/test/model/model_tests.cpp @@ -150,7 +150,8 @@ std::string GetTestDataPath() { std::string testDataPath(MAX_PATH, '\0'); auto environmentVariableFetchSuceeded = GetEnvironmentVariableA("WINML_TEST_DATA_PATH", testDataPath.data(), MAX_PATH); - if (environmentVariableFetchSuceeded == 0 && GetLastError() == ERROR_ENVVAR_NOT_FOUND || environmentVariableFetchSuceeded > MAX_PATH) { + if (environmentVariableFetchSuceeded == 0 && GetLastError() == ERROR_ENVVAR_NOT_FOUND || + environmentVariableFetchSuceeded > MAX_PATH) { // if the WINML_TEST_DATA_PATH environment variable cannot be found, attempt to find the hardcoded models folder std::wstring modulePath = FileHelpers::GetModulePath(); std::filesystem::path currPath = modulePath.substr(0, modulePath.find_last_of(L"\\")); @@ -357,7 +358,8 @@ bool ModifyNameIfDisabledTest(/*inout*/ std::string& testName, winml::LearningMo if (SkipGpuTests()) { reason = "GPU tests are not enabled for this build."; shouldSkip = true; - } else if (disabledGpuAdapterTests.find(testName) != disabledGpuAdapterTests.end() && ShouldSkipTestOnGpuAdapter(testName)) { + } else if (disabledGpuAdapterTests.find(testName) != disabledGpuAdapterTests.end() && + ShouldSkipTestOnGpuAdapter(testName)) { reason = disabledGpuAdapterTests[testName].second; shouldSkip = true; } @@ -386,9 +388,7 @@ std::string GetFullNameOfTest(ITestCase* testCase, winml::LearningModelDeviceKin name += tokenizedModelPath[tokenizedModelPath.size() - 2] += "_"; // model name name += tokenizedModelPath[tokenizedModelPath.size() - 3]; // opset version - std::replace_if( - name.begin(), name.end(), [](char c) { return !absl::ascii_isalnum(c); }, '_' - ); + std::replace_if(name.begin(), name.end(), [](char c) { return !absl::ascii_isalnum(c); }, '_'); // Determine if test should be skipped, using the generic name (no CPU or GPU suffix yet). bool isDisabled = ModifyNameIfDisabledTest(/*inout*/ name, deviceKind); diff --git a/winml/test/model/skip_model_tests.h b/winml/test/model/skip_model_tests.h index cf55d8bcbae7e..349332c6ae0e3 100644 --- a/winml/test/model/skip_model_tests.h +++ b/winml/test/model/skip_model_tests.h @@ -114,14 +114,14 @@ std::unordered_map disabledTests({ { "coreml_DecisionTreeClassifier_OpenML_1464_blood_transfusion_opset7", disabledTestDefaultReason}, { "coreml_AgeNet_ImageNet_opset7", disabledTestDefaultReason}, - // GPU specific cases: + // GPU specific cases: // ONNX zoo models { "mask_rcnn_opset10_GPU", "Bug 31005388: mask_rcnn opset 10 onnx zoo model fails to evaluate on DirectML https://microsoft.visualstudio.com/OS/_workitems/edit/31005388" }, { "faster_rcnn_opset10_GPU", "Bug 31005511: Failed to extract tensor data from evaluate result of faster_rcnn opset 10 model in DirectML https://microsoft.visualstudio.com/OS/_workitems/edit/31005511" }, - // ONNX model zoo's int8/qdq models generally do not work on CPUs that lack 8-bit instructions. 
+ // ONNX model zoo's int8/qdq models generally do not work on CPUs that lack 8-bit instructions. { "YOLOv3_12_int8_opset12", disabledTestDefaultReason}, { "VGG_16_int8_opset12", disabledTestDefaultReason}, { "SSD_int8_opset12", disabledTestDefaultReason}, @@ -137,7 +137,7 @@ std::unordered_map disabledTests({ { "EfficientNet_Lite4_qdq_opset11", disabledTestDefaultReason}, { "EfficientNet_Lite4_int8_opset11", disabledTestDefaultReason}, - // Tier 2 models + // Tier 2 models { "fp16_test_tiny_yolov2_opset7_GPU", "Bug 31005780: Result of fp16_test_tiny_yolov2_opset7 and fp16_coreml_FNS_Candy_opset7 models on DirectML aren't as accurate as on CPU https://microsoft.visualstudio.com/OS/_workitems/edit/31005780"}, { "fp16_tiny_yolov2_opset8_GPU", diff --git a/winml/test/scenario/cppwinrt/NoisyReluCpu.h b/winml/test/scenario/cppwinrt/NoisyReluCpu.h index 5cccbae67407c..e419205fd52dc 100644 --- a/winml/test/scenario/cppwinrt/NoisyReluCpu.h +++ b/winml/test/scenario/cppwinrt/NoisyReluCpu.h @@ -65,12 +65,14 @@ struct NoisyReluOperator : winrt::implementsGetTensorDataType() == MLOperatorTensorDataType::Float && inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float) { + if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float && + inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float) { // For cpu data if (outputTensor->IsCpuData() && inputTensor->IsCpuData()) { ComputeInternal(inputTensor.get(), outputTensor.get(), inputDataSize); } - } else if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double && inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double) { + } else if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double && + inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double) { // For cpu data if (outputTensor->IsCpuData() && inputTensor->IsCpuData()) { ComputeInternal(inputTensor.get(), outputTensor.get(), inputDataSize); diff --git a/winml/test/scenario/cppwinrt/ReluCpu.h b/winml/test/scenario/cppwinrt/ReluCpu.h index 7bb275f7b399b..e8e91489fe872 100644 --- a/winml/test/scenario/cppwinrt/ReluCpu.h +++ b/winml/test/scenario/cppwinrt/ReluCpu.h @@ -60,12 +60,14 @@ struct ReluOperator : winrt::implements { } // If the tensor types are both float type - if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float && inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float) { + if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float && + inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Float) { // For cpu data if (outputTensor->IsCpuData() && inputTensor->IsCpuData()) { ComputeInternal(inputTensor.get(), outputTensor.get(), inputDataSize); } - } else if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double && inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double) { + } else if (outputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double && + inputTensor->GetTensorDataType() == MLOperatorTensorDataType::Double) { // For cpu data if (outputTensor->IsCpuData() && inputTensor->IsCpuData()) { ComputeInternal(inputTensor.get(), outputTensor.get(), inputDataSize); From f4edf9bb58911da401c128c70e088051bfbf93c5 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Wed, 24 Jul 2024 16:39:32 -0700 Subject: [PATCH 12/57] Extend QDQPropagation transformer to handle multiple consumers (#21313) ### Description - Extends the QDQPropagationTransformer to propagate DQs (forward) across operators with multiple consumers 
(previously only supported 1 consumer). - Adds Slice to the list of operators that the QDQPropagationTransformer can propagate DQ/Q ops across. - Supports QDQ propagation for opset 21. - Correctly copies Q or DQ attributes when creating new nodes. ### Motivation and Context The QDQPropagationTransformer fixes up QDQ node units for certain "data movement" ops (e.g., Transpose) by inserting Q -> DQ sequences where necessary. For example, the sequence `DQ -> Transpose -> Sigmoid` is transformed to `DQ -> Transpose -> Q -> DQ -> Sigmoid`. However, this fix-up does not currently support data movement ops with multiple consumers, as in: ``` DQ -> Transpose --+--> Sigmoid -> | +--> Relu -> | +-> graph_output ``` With the updates in this PR, the above model can be transformed to: ``` DQ -> Transpose -> Q --+--> DQ -> Sigmoid -> | +--> DQ -> Relu -> | +--> DQ -> graph_output ``` This update allows QNN EP to support quantized models created with tools that do not wrap data movement ops in Q/DQ ops. --------- Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- .../qdq_transformer/qdq_propagation.cc | 341 +++++++++++++----- .../optimizer/graph_transform_test_builder.cc | 4 +- .../test/optimizer/qdq_transformer_test.cc | 168 +++++++++ 3 files changed, 420 insertions(+), 93 deletions(-) diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_propagation.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_propagation.cc index f0e76312d6e00..7b518947138a5 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_propagation.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_propagation.cc @@ -3,8 +3,13 @@ #include "core/optimizer/qdq_transformer/qdq_propagation.h" +#include #include +#include +#include +#include +#include "core/common/inlined_containers_fwd.h" #include "core/graph/extended_graph_edge.h" #include "core/graph/graph_utils.h" #include "core/optimizer/initializer.h" @@ -17,39 +22,147 @@ namespace onnxruntime { namespace { bool CanNodePropagate(const Node& node) { return graph_utils::IsSupportedOptypeVersionAndDomain(node, "MaxPool", {12}) || - graph_utils::IsSupportedOptypeVersionAndDomain(node, "Reshape", {5, 13, 14, 19}) || - graph_utils::IsSupportedOptypeVersionAndDomain(node, "Transpose", {1, 13}) || - graph_utils::IsSupportedOptypeVersionAndDomain(node, "Squeeze", {1, 11, 13}) || - graph_utils::IsSupportedOptypeVersionAndDomain(node, "Unsqueeze", {1, 11, 13}); + graph_utils::IsSupportedOptypeVersionAndDomain(node, "Reshape", {5, 13, 14, 19, 21}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "Transpose", {1, 13, 21}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "Squeeze", {1, 11, 13, 21}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "Unsqueeze", {1, 11, 13, 21}) || + graph_utils::IsSupportedOptypeVersionAndDomain(node, "Slice", {1, 10, 11, 13}); } -// convert this: src_node -> dst_node -// to this: src_node -> Q -> DQ -> dst_node -// assumptions: -// 1. insertion_edge is valid - node indexes refer to valid nodes, arg name refers to a valid NodeArg, and it -// corresponds to an actual graph relationship -// 2. 
scale_initializer_nodearg and zp_initializer_nodearg_ptr (if not null) are constant initializers -Status InsertQDQPair(Graph& graph, const ExtendedGraphEdge& insertion_edge, - NodeArg& scale_initializer_nodearg, NodeArg* zp_initializer_nodearg_ptr, - const std::string& qdq_domain, const logging::Logger& logger) { - auto* src_node = insertion_edge.GetMutableNodeAtEnd(graph, ExtendedGraphEdge::End::Source); - auto* dst_node = insertion_edge.GetMutableNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); - - ORT_ENFORCE(src_node || dst_node, "At least one graph node must be specified in the propagation edge."); - - const auto& base_name = insertion_edge.arg_name; +// Makes matching attributes for new QuantizeLinear nodes from an existing DequantizeLinear node. +NodeAttributes MakeQAttrsFromDQ(const Node& dq_node) { + assert(dq_node.SinceVersion() <= 21); // Checked by previous call to QDQ::MatchDQNode(). + // In opset <= 21, all DQ attributes (i.e., axis and block_size) are also Q attributes. + // So, set a copy of the DQ attributes. + return dq_node.GetAttributes(); +} + +// Makes matching attributes for new DequantizeLinear nodes from an existing QuantizeLinear node. +NodeAttributes MakeDQAttrsFromQ(const Node& q_node) { + assert(q_node.SinceVersion() <= 21); // Checked by previous call to QDQ::MatchQNode(). + const NodeAttributes& q_attrs = q_node.GetAttributes(); + if (q_attrs.empty()) { + return {}; + } + + // In opset <= 21, only the "axis" and "block_size" attributes for Q are also DQ attributes. + NodeAttributes dq_attrs; + + auto axis_attr_it = q_attrs.find("axis"); + if (axis_attr_it != q_attrs.end()) { + dq_attrs.insert({axis_attr_it->first, axis_attr_it->second}); + } + + auto block_size_attr_it = q_attrs.find("block_size"); + if (block_size_attr_it != q_attrs.end()) { + dq_attrs.insert({block_size_attr_it->first, block_size_attr_it->second}); + } + + return dq_attrs; +} + +// Validates edges into which to insert Q -> DQ ops. +// - Must have at least one edge. +// - All edges must correspond to the same graph NodeArg (i.e., same source but potentially different destination). +// - All edges must be attached to either a source node or a destination node. 
+Status ValidateQDQInsertionEdges(Graph& graph, gsl::span insertion_edges) { + const size_t num_edges = insertion_edges.size(); + ORT_RETURN_IF(num_edges == 0, "Expected at least one edge into which to insert QDQ pair."); + + const ExtendedGraphEdge& first_edge = insertion_edges[0]; + const Node* src_node = first_edge.GetNodeAtEnd(graph, ExtendedGraphEdge::End::Source); + const Node* first_dst_node = first_edge.GetNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); + const std::string& node_arg_name = first_edge.arg_name; + ORT_RETURN_IF_NOT(graph.GetNodeArg(node_arg_name) != nullptr, + "QDQ insertion edge does not have a valid graph NodeArg for ", node_arg_name); + ORT_RETURN_IF_NOT(src_node != nullptr || first_dst_node != nullptr, + "QDQ insertion edge [0] for NodeArg ", node_arg_name, + " must have a source or a destination node"); + + for (size_t i = 1; i < num_edges; i++) { + const ExtendedGraphEdge& insertion_edge = insertion_edges[i]; + ORT_RETURN_IF_NOT(insertion_edge.arg_name == node_arg_name, + "QDQ insertion edge [", i, "] has NodeArg ", insertion_edge.arg_name, + " but expected NodeArg ", node_arg_name); + + const Node* edge_dst_node = insertion_edge.GetNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); + ORT_RETURN_IF_NOT(src_node != nullptr || edge_dst_node != nullptr, + "QDQ insertion edge [", i, "] for NodeArg ", node_arg_name, + " must have a source or a destination node"); + } + + return Status::OK(); +} + +// Logs information about the edges into which Q/DQ nodes will be inserted in InsertQDQPairs(). +// Assumes the edges have already been validated. +void LogQDQInsertion(const logging::Logger& logger, logging::Severity severity, const CodeLocation& code_location, + const Graph& graph, gsl::span edges) { + auto logging_data_type = logging::DataType::SYSTEM; + if (!logger.OutputIsEnabled(severity, logging_data_type)) { + return; + } + + const Node* src_node = edges[0].GetNodeAtEnd(graph, ExtendedGraphEdge::End::Source); + const auto& node_arg_name = edges[0].arg_name; + std::string src_label = src_node ? MakeString("node (\"", src_node->Name(), "\", index: ", src_node->Index(), ")") + : "input"; + std::ostringstream dst_labels; + const size_t num_edges = edges.size(); + + for (size_t i = 0; i < num_edges; ++i) { + const ExtendedGraphEdge& edge = edges[i]; + const Node* dst_node = edge.GetNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); + dst_labels << (dst_node ? MakeString("dst node (\"", dst_node->Name(), "\", index: ", dst_node->Index(), ")") + : "output") + << (i == num_edges - 1 ? "" : ","); + } + + logging::Capture(logger, severity, logging::Category::onnxruntime, logging_data_type, code_location).Stream() + << "Inserted Q/DQ pair between " + << (src_node ? MakeString("src node (\"", src_node->Name(), "\", index: ", src_node->Index(), ")") + : "input") + << " and " << dst_labels.str() + << " at NodeArg \"" << node_arg_name << "\"."; +} + +// convert this: src_node (or graph input) --+--> dst_node_0 (or graph output) +// | +// +--> dst_node_1 +// | ... +// +--> dst_node_n +// +// to this: src_node (or graph input) -> Q --+--> DQ -> dst_node_0 (or graph output) +// | +// +--> DQ -> dst_node_1 +// | ... +// +--> DQ -> dst_node_n +// Checks that all insertion edges share the same NodeArg. That is, the edges originate from the same source node +// output. If there is no src_node, then all edges should come from the same graph input. +// This function returns an error status if edges are invalid. 
+// +// Assumes that scale_initializer_nodearg and zp_initializer_nodearg_ptr (if not null) are constant initializers. +Status InsertQDQPairs(Graph& graph, gsl::span insertion_edges, + NodeArg& scale_initializer_nodearg, NodeArg* zp_initializer_nodearg_ptr, + const std::string& qdq_domain, const NodeAttributes& q_attrs, const NodeAttributes& dq_attrs, + const logging::Logger& logger) { + ORT_RETURN_IF_ERROR(ValidateQDQInsertionEdges(graph, insertion_edges)); + + const ExtendedGraphEdge& first_edge = insertion_edges[0]; // ValidateQDQInsertionEdges() guarantees at least one edge + + Node* src_node = first_edge.GetMutableNodeAtEnd(graph, ExtendedGraphEdge::End::Source); // nullptr for graph input + const auto& base_name = first_edge.arg_name; auto& base_node_arg = *graph.GetNodeArg(base_name); - LOGS(logger, VERBOSE) << "Inserting Q/DQ pair between " - << (src_node ? MakeString("node (\"", src_node->Name(), "\", index: ", src_node->Index(), ")") - : "input") - << " and " - << (dst_node ? MakeString("node (\"", dst_node->Name(), "\", index: ", dst_node->Index(), ")") - : "output") - << " at NodeArg \"" << base_name << "\"."; + LogQDQInsertion(logger, logging::Severity::kVERBOSE, ORT_WHERE, graph, insertion_edges); - // set up new NodeArgs - auto& pre_q_nodearg = insertion_edge.HasGraphInputOrInitializer() + auto make_q_or_dq_inputs = [](NodeArg& data, NodeArg& scale, NodeArg* zero_point) { + return zero_point ? InlinedVector{&data, &scale, zero_point} + : InlinedVector{&data, &scale}; + }; + + // Create Q node that will be inserted after src_node + auto& pre_q_nodearg = first_edge.HasGraphInputOrInitializer() ? base_node_arg : graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(base_name + "_pre_q"), nullptr); @@ -57,17 +170,6 @@ Status InsertQDQPair(Graph& graph, const ExtendedGraphEdge& insertion_edge, auto& q_to_dq_nodearg = graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(base_name + "_q_to_dq"), nullptr); - auto& post_dq_nodearg = insertion_edge.HasGraphOutput() - ? base_node_arg - : graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(base_name + "_post_dq"), - nullptr); - - // set up new Nodes - auto make_q_or_dq_inputs = [](NodeArg& data, NodeArg& scale, NodeArg* zero_point) { - return zero_point ? 
std::vector{&data, &scale, zero_point} - : std::vector{&data, &scale}; - }; - auto& q_node = graph.AddNode(graph.GenerateNodeName(base_name + "_q"), QDQ::QOpName, "Inserted by QDQPropagationTransformer", @@ -76,40 +178,61 @@ Status InsertQDQPair(Graph& graph, const ExtendedGraphEdge& insertion_edge, zp_initializer_nodearg_ptr), // outputs {&q_to_dq_nodearg}, - nullptr, // attributes + &q_attrs, // attributes qdq_domain); ORT_RETURN_IF_NOT(graph.SetOpSchemaFromRegistryForNode(q_node), "Failed to set op schema for added Q node."); - auto& dq_node = graph.AddNode(graph.GenerateNodeName(base_name + "_dq"), - QDQ::DQOpName, - "Inserted by QDQPropagationTransformer", - // inputs - make_q_or_dq_inputs(q_to_dq_nodearg, scale_initializer_nodearg, - zp_initializer_nodearg_ptr), - // outputs - {&post_dq_nodearg}, - nullptr, // attributes - qdq_domain); - - ORT_RETURN_IF_NOT(graph.SetOpSchemaFromRegistryForNode(dq_node), "Failed to set op schema for added DQ node."); - - // set up edges - if (src_node && dst_node) { - graph.RemoveEdge(src_node->Index(), dst_node->Index(), - insertion_edge.src->arg_idx, insertion_edge.dst->arg_idx); - } - if (src_node) { - src_node->MutableOutputDefs()[insertion_edge.src->arg_idx] = &pre_q_nodearg; - graph.AddEdge(src_node->Index(), q_node.Index(), insertion_edge.src->arg_idx, 0); - } + // Remove original edges between src and dst nodes. + for (const auto& insertion_edge : insertion_edges) { + auto* dst_node = insertion_edge.GetMutableNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); + + if (dst_node) { + graph.RemoveEdge(src_node->Index(), dst_node->Index(), + insertion_edge.src->arg_idx, insertion_edge.dst->arg_idx); + } + } - graph.AddEdge(q_node.Index(), dq_node.Index(), 0, 0); + // Add edge from src to Q node. + src_node->MutableOutputDefs()[first_edge.src->arg_idx] = &pre_q_nodearg; + graph.AddEdge(src_node->Index(), q_node.Index(), first_edge.src->arg_idx, 0); + } - if (dst_node) { - dst_node->MutableInputDefs()[insertion_edge.dst->arg_idx] = &post_dq_nodearg; - graph.AddEdge(dq_node.Index(), dst_node->Index(), 0, insertion_edge.dst->arg_idx); + // Create a DQ node for each dst node and connect remaining edges. + for (size_t edge_idx = 0; edge_idx < insertion_edges.size(); ++edge_idx) { + const auto& insertion_edge = insertion_edges[edge_idx]; + const std::string edge_suffix = edge_idx == 0 ? "" : std::to_string(edge_idx); + auto& post_dq_nodearg = insertion_edge.HasGraphOutput() + ? 
base_node_arg + : graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(MakeString(base_name, + "_post_dq", + edge_suffix)), + nullptr); + + auto& dq_node = graph.AddNode(graph.GenerateNodeName(MakeString(base_name, "_dq", edge_suffix)), + QDQ::DQOpName, + "Inserted by QDQPropagationTransformer", + // inputs + make_q_or_dq_inputs(q_to_dq_nodearg, scale_initializer_nodearg, + zp_initializer_nodearg_ptr), + // outputs + {&post_dq_nodearg}, + &dq_attrs, // attributes + qdq_domain); + + ORT_RETURN_IF_NOT(graph.SetOpSchemaFromRegistryForNode(dq_node), "Failed to set op schema for added DQ node."); + + Node* dst_node = insertion_edge.GetMutableNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); + + // Add edge from Q to DQ + graph.AddEdge(q_node.Index(), dq_node.Index(), 0, 0); + + // Add edge from DQ to dst_node + if (dst_node) { + dst_node->MutableInputDefs()[insertion_edge.dst->arg_idx] = &post_dq_nodearg; + graph.AddEdge(dq_node.Index(), dst_node->Index(), 0, insertion_edge.dst->arg_idx); + } } return Status::OK(); @@ -156,37 +279,39 @@ std::optional GetPreviousPropagationEdge(const Graph& graph, return GetPreviousEdge(graph, *src_node); } -std::optional GetNextEdge(const Graph& graph, const Node& node) { - // for now we can just consider the first output (index 0) +InlinedVector GetNextEdges(const Graph& graph, const Node& node) { + constexpr int node_output_index = 0; // for now we can just consider the first output (index 0) + InlinedVector next_edges; + const auto output_edges = graph_utils::GraphEdge::GetNodeOutputEdges(node, static_cast(node_output_index)); - const auto output_edges = graph_utils::GraphEdge::GetNodeOutputEdges(node, 0); - if (output_edges.empty()) { - // maybe edge to output - return ExtendedGraphEdge::TryCreateFromNodeToOutput(graph, node, 0); + // edges to next nodes + for (const auto& output_edge : output_edges) { + next_edges.push_back(ExtendedGraphEdge::CreateFromValidGraphEdge(output_edge)); } - if (!graph.IsOutput(node.OutputDefs()[0]) && output_edges.size() == 1) { - // single edge to next node - return ExtendedGraphEdge::CreateFromValidGraphEdge(output_edges.front()); + // maybe edge to graph output + auto edge_to_output = ExtendedGraphEdge::TryCreateFromNodeToOutput(graph, node, node_output_index); + if (edge_to_output.has_value()) { + next_edges.push_back(edge_to_output.value()); } - return std::nullopt; + return next_edges; } -std::optional GetNextPropagationEdge(const Graph& graph, - const ExtendedGraphEdge& edge) { +InlinedVector GetNextPropagationEdges(const Graph& graph, + const ExtendedGraphEdge& edge) { if (edge.HasGraphOutput()) { - return std::nullopt; + return {}; } const auto* dst_node = edge.GetNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); ORT_ENFORCE(dst_node != nullptr); if (!CanNodePropagate(*dst_node)) { - return std::nullopt; + return {}; } - return GetNextEdge(graph, *dst_node); + return GetNextEdges(graph, *dst_node); } class GraphConstantInitializerGetter { @@ -228,21 +353,54 @@ Status PropagateDQForward(Graph& graph, gsl::span node_indices, ? 
dq_node.MutableInputDefs()[QDQ::InputIndex::ZERO_POINT_ID] : nullptr; - const auto edge_after_dq = GetNextEdge(graph, dq_node); - if (!edge_after_dq) { + const InlinedVector edges_after_dq = GetNextEdges(graph, dq_node); + if (edges_after_dq.size() != 1) { continue; } - for (auto curr_edge = GetNextPropagationEdge(graph, *edge_after_dq); - curr_edge.has_value(); - curr_edge = GetNextPropagationEdge(graph, *curr_edge)) { - if (const auto* dst_node = curr_edge->GetNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); - dst_node && QDQ::MatchQNode(*dst_node)) { - break; + // Utility function to check if any edge out of a node (e.g., Transpose) ends in a Q node. + auto any_edge_ends_in_q = [](Graph& graph, const InlinedVector& edges) -> bool { + for (const auto& edge : edges) { + const auto* edge_dst_node = edge.GetNodeAtEnd(graph, ExtendedGraphEdge::End::Destination); + if (edge_dst_node && QDQ::MatchQNode(*edge_dst_node)) { + return true; + } + } + return false; + }; + + // Propagate DQ forward in a BFS traversal of NodeArg edges. A NodeArg "edge group" consists of one or more edges + // that all begin at the same source node's output slot and end at a graph output or a destination node. + // Ex: The subgraph below shows a NodeArg edge group (containing 3 edges) that begins at a + // Transpose, ends at two destination nodes, and produces a graph output. + // DQ -> Transpose --+--> Sigmoid -> ... + // | + // +--> Slice -> ... + // | + // +--> graph_output + std::queue> node_arg_edges; + node_arg_edges.push(GetNextPropagationEdges(graph, edges_after_dq[0])); + + while (!node_arg_edges.empty()) { + const InlinedVector curr_edge_group = std::move(node_arg_edges.front()); + node_arg_edges.pop(); + + // Skip if edge group is empty. Also, to keep things simple, we do not yet handle edge groups in which + // one of the destination nodes is already a QuantizeLinear node. Ex: + // DQ -> Transpose --+--> QuantizeLinear -> ... + // | + // +--> Slice -> ... 
+ if (curr_edge_group.empty() || any_edge_ends_in_q(graph, curr_edge_group)) { + continue; } - ORT_RETURN_IF_ERROR(InsertQDQPair(graph, *curr_edge, dq_scale, dq_zero_point, dq_node.Domain(), logger)); + ORT_RETURN_IF_ERROR(InsertQDQPairs(graph, curr_edge_group, dq_scale, dq_zero_point, dq_node.Domain(), + MakeQAttrsFromDQ(dq_node), dq_node.GetAttributes(), logger)); modified = true; + + for (const auto& edge : curr_edge_group) { + node_arg_edges.push(GetNextPropagationEdges(graph, edge)); + } } } @@ -290,7 +448,8 @@ Status PropagateQBackward(Graph& graph, gsl::span node_indices, break; } - ORT_RETURN_IF_ERROR(InsertQDQPair(graph, *curr_edge, q_scale, q_zero_point, q_node.Domain(), logger)); + ORT_RETURN_IF_ERROR(InsertQDQPairs(graph, InlinedVector{*curr_edge}, q_scale, q_zero_point, + q_node.Domain(), q_node.GetAttributes(), MakeDQAttrsFromQ(q_node), logger)); modified = true; } } diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.cc b/onnxruntime/test/optimizer/graph_transform_test_builder.cc index 2cbfbbb317642..03a71868a3dc1 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.cc +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.cc @@ -246,14 +246,14 @@ Status TestGraphTransformer(const std::function& ORT_RETURN_IF_ERROR(pre_graph_checker(graph)); } #if SAVE_TEST_GRAPH - ORT_RETURN_IF_ERROR(Model::Save(model, "model_original.onnx")); + ORT_RETURN_IF_ERROR(Model::Save(model, ToPathString("model_original.onnx"))); #endif ORT_RETURN_IF_ERROR(graph_transformation_mgr.ApplyTransformers(graph, level, logger)); if (post_graph_checker) { ORT_RETURN_IF_ERROR(post_graph_checker(graph)); } #if SAVE_TEST_GRAPH - ORT_RETURN_IF_ERROR(Model::Save(model, "model_optimized.onnx")); + ORT_RETURN_IF_ERROR(Model::Save(model, ToPathString("model_optimized.onnx"))); #endif }; diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 14c5b60d6e0bd..fb85eb4c29bb6 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -12,6 +12,7 @@ #include "core/mlas/inc/mlas.h" #include "core/optimizer/double_qdq_pairs_remover.h" #include "core/optimizer/qdq_transformer/qdq_final_cleanup.h" +#include "core/optimizer/qdq_transformer/qdq_propagation.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h" #include "core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.h" #include "core/optimizer/qdq_transformer/selectors_actions/shared/utils.h" @@ -3084,6 +3085,57 @@ TEST(QDQTransformerTests, QDQPropagation_QBackward) { #endif } +// Test backwards propagation of a QuantizeLinear node that uses the "output_dtype" attribute +// to set the quantization type (i.e., does not have an explicit zero-point input). This tests +// the copying of attributes for QDQ propagation. 
+TEST(QDQTransformerTests, QDQPropagation_QBackward_NoZP_OutputDtypeAttribute) { + auto test_case = [&](ONNX_NAMESPACE::TensorProto_DataType q_output_type) { + auto build_test_case = [&](ModelTestBuilder& builder) { + auto* input_arg = builder.MakeInput({1, 2, 2}, {-2.0f, 0.0f, 1.0f, 2.0f}); + auto* output_arg = builder.MakeOutput(); + + // add Add + auto* const_1_input = builder.MakeScalarInitializer(1.0f); + auto* add_output = builder.MakeIntermediate(); + builder.AddNode("Add", {input_arg, const_1_input}, {add_output}); + + // add Transpose + auto* transpose_output = builder.MakeIntermediate(); + builder.AddNode("Transpose", {add_output}, {transpose_output}); + + // add Q with a "output_dtype" attribute. Omit the zero-point input (defaults to 0). + constexpr float qdq_scale = 1.0f; + Node& q_node = builder.AddQuantizeLinearNode(transpose_output, qdq_scale, output_arg); + q_node.AddAttribute("output_dtype", static_cast(q_output_type)); + }; + + auto check_graph = [&](InferenceSessionWrapper& session) { + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + std::vector expected_op_types_in_order = { + "Add", + qdq_keys.quantize_linear, + qdq_keys.dequantize_linear, + "Transpose", + qdq_keys.quantize_linear, + }; + + const auto op_types_in_order = GetNodeOpTypesInTopologicalOrder(session.GetGraph(), true); + EXPECT_EQ(op_types_in_order, expected_op_types_in_order); + }; + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Default, + TransformerLevel::Level1, + 21); // Opset >= 21 supports the "output_dtype" attribute + }; + + test_case(ONNX_NAMESPACE::TensorProto_DataType_UINT8); + test_case(ONNX_NAMESPACE::TensorProto_DataType_INT8); + test_case(ONNX_NAMESPACE::TensorProto_DataType_UINT16); + test_case(ONNX_NAMESPACE::TensorProto_DataType_INT16); +} + TEST(QDQTransformerTests, QDQPropagation_DQForward) { auto test_case = [&](const std::vector& input_shape, size_t maxpool_dim, @@ -3420,6 +3472,122 @@ TEST(QDQTransformerTests, QDQPropagation_DQ_Q) { #endif } +// Test propagating a DQ forward through a chain of Slice and Transpose operators that have multiple consumers. 
+// original model: +// in0 -> DQ -> Slice --+--> slice_out +// | +// +--> Add -> out0 +// | +// +--> Transpose --+--> Pow -> out1 +// | | +// | +--> Pow -> out2 +// | +// +--> Transpose --+--> Pow -> out3 +// | +// +--> Pow -> out4 +// expected model: +// in0 -> DQ -> Slice -> Q --+--> DQ -> slice_out +// | +// +--> DQ -> Add -> out0 +// | +// +--> DQ -> TP -> Q --+--> DQ -> Pow -> out1 +// | | +// | +--> DQ -> Pow -> out2 +// | +// +--> DQ -> TP -> Q --+--> DQ -> Pow -> out3 +// | +// +--> DQ -> Pow -> out4 +TEST(QDQTransformerTests, QDQPropagation_DQForward_SliceMultipleConsumers) { + auto run_test_case = [&](bool slice_has_graph_output) { + auto build_test_case = [&](ModelTestBuilder& builder) { + std::vector input0_shape = {1, 2, 2, 2}; + std::vector input1_shape = {1, 1, 1, 1}; + auto* input0_arg = builder.MakeInput(input0_shape, + std::numeric_limits::min(), + std::numeric_limits::max()); + auto* input1_arg = builder.MakeInput(input1_shape, {0.0f}); + auto* output0_arg = builder.MakeOutput(); + auto* output1_arg = builder.MakeOutput(); + auto* output2_arg = builder.MakeOutput(); + auto* output3_arg = builder.MakeOutput(); + auto* output4_arg = builder.MakeOutput(); + + // DQ + constexpr float qdq_scale = 1.0f; + constexpr uint8_t qdq_zero_point = 128; + auto* dq_output = builder.MakeIntermediate(); + builder.AddDequantizeLinearNode(input0_arg, qdq_scale, qdq_zero_point, dq_output); + + // Slice + auto* slice_output = slice_has_graph_output ? builder.MakeOutput() : builder.MakeIntermediate(); + auto* slice_starts = builder.Make1DInitializer(std::vector{0, 0, 0, 0}); + auto* slice_ends = builder.Make1DInitializer(std::vector{1, 1, 1, 1}); + builder.AddNode("Slice", {dq_output, slice_starts, slice_ends}, {slice_output}); + + // Add + builder.AddNode("Add", {slice_output, input1_arg}, {output0_arg}); + + // Transpose + auto* transpose0_output = builder.MakeIntermediate(); + builder.AddNode("Transpose", {slice_output}, {transpose0_output}); + + // Transpose + auto* transpose1_output = builder.MakeIntermediate(); + builder.AddNode("Transpose", {slice_output}, {transpose1_output}); + + // Pows + auto* pow_exp = builder.MakeScalarInitializer(2.0f); + builder.AddNode("Pow", {transpose0_output, pow_exp}, {output1_arg}); + builder.AddNode("Pow", {transpose0_output, pow_exp}, {output2_arg}); + builder.AddNode("Pow", {transpose1_output, pow_exp}, {output3_arg}); + builder.AddNode("Pow", {transpose1_output, pow_exp}, {output4_arg}); + }; + + auto check_graph = [&](InferenceSessionWrapper& session) { + const QDQOpKeys qdq_keys = GetQDQOpKeys(false); + std::vector expected_op_types_in_order; + expected_op_types_in_order.reserve(20); + expected_op_types_in_order.insert(expected_op_types_in_order.end(), + {qdq_keys.dequantize_linear, + "Slice", + qdq_keys.quantize_linear}); + + if (slice_has_graph_output) { + // Should have a DQ before the graph output generated by the Slice. 
+ expected_op_types_in_order.push_back(qdq_keys.dequantize_linear); + } + + expected_op_types_in_order.insert(expected_op_types_in_order.end(), + {qdq_keys.dequantize_linear, + "Add", + qdq_keys.dequantize_linear, + "Transpose", + qdq_keys.quantize_linear, qdq_keys.dequantize_linear, + "Pow", + qdq_keys.dequantize_linear, + "Pow", + qdq_keys.dequantize_linear, + "Transpose", + qdq_keys.quantize_linear, qdq_keys.dequantize_linear, + "Pow", + qdq_keys.dequantize_linear, + "Pow"}); + + const auto op_types_in_order = GetNodeOpTypesInTopologicalOrder(session.GetGraph(), true); + EXPECT_EQ(op_types_in_order, expected_op_types_in_order); + }; + + TransformerTester(build_test_case, + check_graph, + TransformerLevel::Default, + TransformerLevel::Level1, + 18, 0.0, 0.0, std::make_unique()); + }; + + run_test_case(/*slice_has_graph_output*/ false); + run_test_case(/*slice_has_graph_output*/ true); +} + TEST(QDQTransformerTests, QDQ_Selector_Test) { const ORTCHAR_T* model_file_name = ORT_TSTR("testdata/transform/qdq_conv.onnx"); From 08001d18ac41ee2fe95ce9d4d064c2fb725e583f Mon Sep 17 00:00:00 2001 From: pengwa Date: Thu, 25 Jul 2024 08:25:22 +0800 Subject: [PATCH 13/57] Fix security issue #22016 #22017 #22018 (#21333) ### Description ### Motivation and Context --- .../memory_optimizer/recompute_analysis.cc | 814 +++++++++--------- .../training_api/core/training_api_tests.cc | 3 +- .../orttraining/training_api/checkpoint.cc | 2 +- 3 files changed, 416 insertions(+), 403 deletions(-) diff --git a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc index 8d110c692751e..1135ef41cfc47 100644 --- a/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc +++ b/orttraining/orttraining/core/optimizer/memory_optimizer/recompute_analysis.cc @@ -67,410 +67,422 @@ using OpsetToIgnorableIndicesMap = InlinedHashMap; * or not. * 3. Some ops are not supported in older opsets, we need to check whether it is applicable to recompute or not. 
*/ -const InlinedHashMap& GetAllowedRecomputeOps(int probe_op_level) { - static InlinedHashMap> recomputable_op_table_map; - if (recomputable_op_table_map.find(probe_op_level) != recomputable_op_table_map.end()) { - return recomputable_op_table_map.at(probe_op_level); - } +InlinedHashMap> InitializeRecomputableOpTable() { + InlinedHashMap> recomputable_op_table_map; + + constexpr const int basic_op_level = static_cast(ProbeLevel::Basic); + recomputable_op_table_map.insert({basic_op_level, InlinedHashMap()}); + auto& basic_recomputable_op_table = recomputable_op_table_map.at(basic_op_level); + + basic_recomputable_op_table.insert({ + { + utils::GetFullQualifiedOpName("Add", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {13, {}}, + {14, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("BatchNormalization", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {9, {}}, + {14, {}}, + {15, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("BiasGelu", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("BiasDropout", kMSDomain), + { + {1, {3, 4}}, // ignore ratio (optional) and training mode (optional) + }, + }, + { + utils::GetFullQualifiedOpName("BitmaskBiasDropout", kMSDomain), + { + {1, {3, 4}}, // ignore ratio (optional) and training mode (optional) + }, + }, + { + utils::GetFullQualifiedOpName("BitmaskDropout", kMSDomain), + { + {1, {1, 2}}, // ignore ratio (optional) and training mode (optional) + }, + }, + { + utils::GetFullQualifiedOpName("Cast", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {9, {}}, + {13, {}}, + {19, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("ConcatTraining", kMSDomain), + { + {1, {}}, + + }, + }, + { + utils::GetFullQualifiedOpName("ConstantOfShape", kOnnxDomain), + { + {9, {0}}, // ignore the `input`, e.g. the shape of the expected output tensor + {20, {0}}, + }, + }, + { + utils::GetFullQualifiedOpName("Cos", kOnnxDomain), + { + {7, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("CumSum", kOnnxDomain), + { + // The axis input is trivial + {11, {1}}, + {14, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Dropout", kOnnxDomain), + { + // ONNX Dropout 1, 6, 7, 10 do not have seed attribute, so we remove them from the recompute support. + {12, {1, 2}}, // ignore ratio and training_mode + {13, {1, 2}}, + }, + }, + { + utils::GetFullQualifiedOpName("Div", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {13, {}}, + {14, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Einsum", kOnnxDomain), + { + {12, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Equal", kOnnxDomain), + { + {1, {}}, + {7, {}}, + {11, {}}, + {13, {}}, + {19, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Expand", kOnnxDomain), + { + {8, {1}}, // Ignore the shape. 
+ {13, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("FastGelu", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("FlattenAndUnpad", kMSDomain), + { + {1, {1}}, // ignore the indices + }, + }, + { + utils::GetFullQualifiedOpName("Gather", kOnnxDomain), + { + {1, {1}}, // ignore the indices + {11, {1}}, + {13, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Gelu", kOnnxDomain), + { + {20, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Gelu", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Gemm", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {9, {}}, + {11, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Less", kOnnxDomain), + { + {1, {}}, + {7, {}}, + {9, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("MemcpyFromHost", kOnnxDomain), + { + {1, {0}}, // Ignore CPU input. + }, + }, + { + utils::GetFullQualifiedOpName("Mul", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {13, {}}, + {14, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Neg", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("NonZero", kOnnxDomain), + { + {9, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("PadAndUnflatten", kMSDomain), + { + {1, {1, 2}}, // ignore the indices and unflatten_dims + }, + }, + { + // Be noted, NOT all PythonOp will be allowed to recompute, there will be further check. + utils::GetFullQualifiedOpName("PythonOp", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Range", kOnnxDomain), + { + {11, {0, 1, 2}}, // ignore start, end, delta, because they are scalars. + }, + }, + { + utils::GetFullQualifiedOpName("Reshape", kOnnxDomain), + { + {1, {}}, + {5, {}}, // ignore the shape. 
+ {13, {}}, + {14, {}}, + {19, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Sin", kOnnxDomain), + { + {7, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Slice", kOnnxDomain), + { + {1, {}}, + {10, {1, 2, 3, 4}}, // ignore starts, ends, axes (optional) and steps (optional) + {11, {1, 2, 3, 4}}, + {13, {1, 2, 3, 4}}, + }, + }, + { + utils::GetFullQualifiedOpName("Split", kOnnxDomain), + { + {1, {1}}, // ignore split (optional) + {2, {}}, + {11, {}}, + {13, {1}}, // ignore the split (optional) + {18, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Squeeze", kOnnxDomain), + { + {1, {}}, + {11, {}}, + {13, {1}}, // ignore the axes (optional) + }, + }, + { + utils::GetFullQualifiedOpName("Sub", kOnnxDomain), + { + {1, {}}, + {6, {}}, + {7, {}}, + {13, {}}, + {14, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Tile", kOnnxDomain), + { + {1, {1, 2}}, + {6, {1}}, + {13, {1}}, + }, + }, + { + utils::GetFullQualifiedOpName("Transpose", kOnnxDomain), + { + {1, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Trilu", kOnnxDomain), + { + {14, {1}}, // ignore k (optional) + }, + }, + { + utils::GetFullQualifiedOpName("QuickGelu", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Unsqueeze", kOnnxDomain), + { + {1, {}}, + {11, {}}, + {13, {1}}, // ignore the axes (optional) + }, + }, + { + utils::GetFullQualifiedOpName("Where", kOnnxDomain), + { + {9, {}}, + {16, {}}, + }, + }, + + }); + + constexpr const int advanced_op_level = static_cast(ProbeLevel::Advanced); + recomputable_op_table_map.insert({advanced_op_level, InlinedHashMap()}); + auto& advanced_recomputable_op_table = recomputable_op_table_map.at(advanced_op_level); + // Append basic_recomputable_op_table to advanced_recomputable_op_table. + advanced_recomputable_op_table.insert(recomputable_op_table_map.at(basic_op_level).begin(), + recomputable_op_table_map.at(basic_op_level).end()); + + advanced_recomputable_op_table.insert({ + { + utils::GetFullQualifiedOpName("BiasSoftmax", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("BiasSoftmaxDropout", kMSDomain), + { + {1, {2}}, // ignore ratio (optional) + }, + }, + { + utils::GetFullQualifiedOpName("LayerNormalization", kOnnxDomain), + { + // Opset 1 in ONNX official does not have LayerNormalization, + // while our contrib op defined LayerNormalization in opset 1 in ONNX domain. + {1, {}}, + {17, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("MatMul", kOnnxDomain), + { + {1, {}}, + {9, {}}, + {13, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("FusedMatMul", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("SimplifiedLayerNormalization", kOnnxDomain), + { + // Opset 1 in ONNX official does not have SimplifiedLayerNormalization, + // while our contrib op defined SimplifiedLayerNormalization in opset 1 in ONNX domain. 
+ {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("SkipLayerNormalization", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("SkipSimplifiedLayerNormalization", kMSDomain), + { + {1, {}}, + }, + }, + { + utils::GetFullQualifiedOpName("Softmax", kOnnxDomain), + { + {1, {}}, + {11, {}}, + {13, {}}, + }, + }, + }); + + return recomputable_op_table_map; +} - recomputable_op_table_map.insert({probe_op_level, InlinedHashMap()}); - auto& recomputable_op_table = recomputable_op_table_map.at(probe_op_level); - if (probe_op_level >= static_cast(ProbeLevel::Basic)) { - recomputable_op_table.insert({ - { - utils::GetFullQualifiedOpName("Add", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {7, {}}, - {13, {}}, - {14, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("BatchNormalization", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {7, {}}, - {9, {}}, - {14, {}}, - {15, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("BiasGelu", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("BiasDropout", kMSDomain), - { - {1, {3, 4}}, // ignore ratio (optional) and training mode (optional) - }, - }, - { - utils::GetFullQualifiedOpName("BitmaskBiasDropout", kMSDomain), - { - {1, {3, 4}}, // ignore ratio (optional) and training mode (optional) - }, - }, - { - utils::GetFullQualifiedOpName("BitmaskDropout", kMSDomain), - { - {1, {1, 2}}, // ignore ratio (optional) and training mode (optional) - }, - }, - { - utils::GetFullQualifiedOpName("Cast", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {9, {}}, - {13, {}}, - {19, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("ConcatTraining", kMSDomain), - { - {1, {}}, - - }, - }, - { - utils::GetFullQualifiedOpName("ConstantOfShape", kOnnxDomain), - { - {9, {0}}, // ignore the `input`, e.g. the shape of the expected output tensor - {20, {0}}, - }, - }, - { - utils::GetFullQualifiedOpName("Cos", kOnnxDomain), - { - {7, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("CumSum", kOnnxDomain), - { - // The axis input is trivial - {11, {1}}, - {14, {1}}, - }, - }, - { - utils::GetFullQualifiedOpName("Dropout", kOnnxDomain), - { - // ONNX Dropout 1, 6, 7, 10 do not have seed attribute, so we remove them from the recompute support. - {12, {1, 2}}, // ignore ratio and training_mode - {13, {1, 2}}, - }, - }, - { - utils::GetFullQualifiedOpName("Div", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {7, {}}, - {13, {}}, - {14, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Einsum", kOnnxDomain), - { - {12, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Equal", kOnnxDomain), - { - {1, {}}, - {7, {}}, - {11, {}}, - {13, {}}, - {19, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Expand", kOnnxDomain), - { - {8, {1}}, // Ignore the shape. 
- {13, {1}}, - }, - }, - { - utils::GetFullQualifiedOpName("FastGelu", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("FlattenAndUnpad", kMSDomain), - { - {1, {1}}, // ignore the indices - }, - }, - { - utils::GetFullQualifiedOpName("Gather", kOnnxDomain), - { - {1, {1}}, // ignore the indices - {11, {1}}, - {13, {1}}, - }, - }, - { - utils::GetFullQualifiedOpName("Gelu", kOnnxDomain), - { - {20, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Gelu", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Gemm", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {7, {}}, - {9, {}}, - {11, {}}, - {13, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Less", kOnnxDomain), - { - {1, {}}, - {7, {}}, - {9, {}}, - {13, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("MemcpyFromHost", kOnnxDomain), - { - {1, {0}}, // Ignore CPU input. - }, - }, - { - utils::GetFullQualifiedOpName("Mul", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {7, {}}, - {13, {}}, - {14, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Neg", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {13, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("NonZero", kOnnxDomain), - { - {9, {}}, - {13, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("PadAndUnflatten", kMSDomain), - { - {1, {1, 2}}, // ignore the indices and unflatten_dims - }, - }, - { - // Be noted, NOT all PythonOp will be allowed to recompute, there will be further check. - utils::GetFullQualifiedOpName("PythonOp", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Range", kOnnxDomain), - { - {11, {0, 1, 2}}, // ignore start, end, delta, because they are scalars. - }, - }, - { - utils::GetFullQualifiedOpName("Reshape", kOnnxDomain), - { - {1, {}}, - {5, {}}, // ignore the shape. 
- {13, {}}, - {14, {}}, - {19, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Sin", kOnnxDomain), - { - {7, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Slice", kOnnxDomain), - { - {1, {}}, - {10, {1, 2, 3, 4}}, // ignore starts, ends, axes (optional) and steps (optional) - {11, {1, 2, 3, 4}}, - {13, {1, 2, 3, 4}}, - }, - }, - { - utils::GetFullQualifiedOpName("Split", kOnnxDomain), - { - {1, {1}}, // ignore split (optional) - {2, {}}, - {11, {}}, - {13, {1}}, // ignore the split (optional) - {18, {1}}, - }, - }, - { - utils::GetFullQualifiedOpName("Squeeze", kOnnxDomain), - { - {1, {}}, - {11, {}}, - {13, {1}}, // ignore the axes (optional) - }, - }, - { - utils::GetFullQualifiedOpName("Sub", kOnnxDomain), - { - {1, {}}, - {6, {}}, - {7, {}}, - {13, {}}, - {14, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Tile", kOnnxDomain), - { - {1, {1, 2}}, - {6, {1}}, - {13, {1}}, - }, - }, - { - utils::GetFullQualifiedOpName("Transpose", kOnnxDomain), - { - {1, {}}, - {13, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Trilu", kOnnxDomain), - { - {14, {1}}, // ignore k (optional) - }, - }, - { - utils::GetFullQualifiedOpName("QuickGelu", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Unsqueeze", kOnnxDomain), - { - {1, {}}, - {11, {}}, - {13, {1}}, // ignore the axes (optional) - }, - }, - { - utils::GetFullQualifiedOpName("Where", kOnnxDomain), - { - {9, {}}, - {16, {}}, - }, - }, - - }); - } +const InlinedHashMap& GetAllowedRecomputeOps(int probe_op_level) { + static InlinedHashMap> + recomputable_op_table_map = InitializeRecomputableOpTable(); - if (probe_op_level >= static_cast(ProbeLevel::Advanced)) { - recomputable_op_table.insert({ - { - utils::GetFullQualifiedOpName("BiasSoftmax", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("BiasSoftmaxDropout", kMSDomain), - { - {1, {2}}, // ignore ratio (optional) - }, - }, - { - utils::GetFullQualifiedOpName("LayerNormalization", kOnnxDomain), - { - // Opset 1 in ONNX official does not have LayerNormalization, - // while our contrib op defined LayerNormalization in opset 1 in ONNX domain. - {1, {}}, - {17, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("MatMul", kOnnxDomain), - { - {1, {}}, - {9, {}}, - {13, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("FusedMatMul", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("SimplifiedLayerNormalization", kOnnxDomain), - { - // Opset 1 in ONNX official does not have SimplifiedLayerNormalization, - // while our contrib op defined SimplifiedLayerNormalization in opset 1 in ONNX domain. 
- {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("SkipLayerNormalization", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("SkipSimplifiedLayerNormalization", kMSDomain), - { - {1, {}}, - }, - }, - { - utils::GetFullQualifiedOpName("Softmax", kOnnxDomain), - { - {1, {}}, - {11, {}}, - {13, {}}, - }, - }, - }); - } + ORT_ENFORCE(recomputable_op_table_map.find(probe_op_level) != recomputable_op_table_map.end(), + "Cannot get recomputable op table, probe level: ", probe_op_level); - return recomputable_op_table; + return recomputable_op_table_map.at(probe_op_level); } /** diff --git a/orttraining/orttraining/test/training_api/core/training_api_tests.cc b/orttraining/orttraining/test/training_api/core/training_api_tests.cc index 90c97eed0c6d3..be25eefb201da 100644 --- a/orttraining/orttraining/test/training_api/core/training_api_tests.cc +++ b/orttraining/orttraining/test/training_api/core/training_api_tests.cc @@ -542,8 +542,9 @@ TEST(TrainingApiTest, OptimStep) { std::string param_name = "fc2.weight"; // before training, check if optim state is initialized to 0 onnxruntime::training::api::OptimizerCheckpointState& optimizer_states = state.optimizer_checkpoint_state; + std::shared_ptr group0_states = optimizer_states.group_named_optimizer_states["group0"]; onnxruntime::training::api::ParameterOptimizerState& param_state = - optimizer_states.group_named_optimizer_states["group0"]->param_named_optimizer_states.at(param_name); + group0_states->param_named_optimizer_states.at(param_name); OrtValue& moment_1 = param_state.at("momentum0"); std::vector param_vec_before_optimizer_step; diff --git a/orttraining/orttraining/training_api/checkpoint.cc b/orttraining/orttraining/training_api/checkpoint.cc index 56029b34c24d7..cbff1891b8c84 100644 --- a/orttraining/orttraining/training_api/checkpoint.cc +++ b/orttraining/orttraining/training_api/checkpoint.cc @@ -449,7 +449,7 @@ Status FromOptimizerState(const OptimizerCheckpointState& optimizer_state, fbs_optimizer_groups.reserve(optimizer_state.group_named_optimizer_states.size()); for (const auto& group_name : SortedKeys(optimizer_state.group_named_optimizer_states)) { - const std::shared_ptr& group_optimizer_state_ptr = + std::shared_ptr group_optimizer_state_ptr = optimizer_state.group_named_optimizer_states.at(group_name); std::vector> optimizer_states; From ae3ec2e9ac1f1a1dde23407051d409fc8b52e639 Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Wed, 24 Jul 2024 17:48:22 -0700 Subject: [PATCH 14/57] Ignore ruff rule `N813` (#21477) Allow importing camelcase names in lowercase --- .../test/python/onnxruntime_test_python_backend_mlops.py | 3 +-- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py index b5400b487cfc2..c245699e211d4 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py +++ b/onnxruntime/test/python/onnxruntime_test_python_backend_mlops.py @@ -1,7 +1,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. 
-# -*- coding: UTF-8 -*- import unittest import numpy as np @@ -10,7 +9,7 @@ import onnxruntime.backend as backend from onnxruntime import datasets -from onnxruntime.backend.backend import OnnxRuntimeBackend as ort_backend # noqa: N813 +from onnxruntime.backend.backend import OnnxRuntimeBackend as ort_backend def check_list_of_map_to_float(testcase, expected_rows, actual_rows): diff --git a/pyproject.toml b/pyproject.toml index 1c3a719fb544a..6429df2722b2d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -77,6 +77,7 @@ ignore = [ "G004", # FIXME: Enable when the rule can be autofixed "N803", # Argument casing "N812", # Allow import torch.nn.functional as F + "N813", # Allow importing camelcase names in lowercase "N999", # Module names "NPY002", # np.random.Generator may not always fit our use cases "PERF203", # "try-except-in-loop" only affects Python <3.11, and the improvement is minor; can have false positives From ca47f0fdd33ab267d8066edd6441ac4090bbe4aa Mon Sep 17 00:00:00 2001 From: Preetha Veeramalai Date: Wed, 24 Jul 2024 23:45:31 -0700 Subject: [PATCH 15/57] OVEP - PR 1.19 (#21443) ### Description Add OVEP features for 1.19 The PR has, - Added support for EpCtx with ORT Session options for optimized performance. - Added bug fixes - Support for OV 2024.3 --------- Co-authored-by: ubuntu Co-authored-by: vthaniel Co-authored-by: sfatimar Co-authored-by: saurabhkale17 Co-authored-by: Maheshkar --- cmake/onnxruntime_providers_openvino.cmake | 4 +- docs/python/ReadMeOV.rst | 8 ++- .../providers/openvino/backend_manager.cc | 45 ++++++++++----- .../openvino/backends/basic_backend.cc | 28 ++++++---- .../openvino/backends/basic_backend.h | 2 +- .../openvino/onnx_ctx_model_helper.cc | 14 ++--- .../openvino/onnx_ctx_model_helper.h | 3 +- .../openvino/openvino_execution_provider.cc | 3 +- .../openvino/openvino_execution_provider.h | 55 ++++++++++++++----- .../openvino/openvino_provider_factory.cc | 50 ++++++++++++++--- .../core/providers/openvino/ov_interface.cc | 43 +++++++-------- .../core/providers/openvino/ov_interface.h | 21 ++++--- .../openvino/ov_versions/capability.cc | 14 ++--- .../openvino/ov_versions/data_ops.cc | 10 ++-- .../providers/openvino/ov_versions/data_ops.h | 4 +- .../qdq_transformations/qdq_stripping.cc | 36 +++++++++--- .../core/session/provider_bridge_ort.cc | 21 ++++++- .../test/perftest/command_args_parser.cc | 1 - onnxruntime/test/perftest/ort_test_session.cc | 4 ++ onnxruntime/test/providers/checkers.cc | 20 ++++++- .../providers/cpu/rnn/deep_cpu_gru_op_test.cc | 6 +- 21 files changed, 271 insertions(+), 121 deletions(-) diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index d738e29101cfe..5d1a481d40abc 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -17,8 +17,8 @@ # Header paths find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX) - if(OpenVINO_VERSION VERSION_LESS 2023.0) - message(FATAL_ERROR "OpenVINO 2023.0 and newer are supported. Please, latest OpenVINO release") + if(OpenVINO_VERSION VERSION_LESS 2024.0) + message(FATAL_ERROR "OpenVINO 2024.0 and newer are supported. 
Please, use latest OpenVINO release") endif() if (WIN32) diff --git a/docs/python/ReadMeOV.rst b/docs/python/ReadMeOV.rst index 6ef16e1378139..86914699bbf6d 100644 --- a/docs/python/ReadMeOV.rst +++ b/docs/python/ReadMeOV.rst @@ -7,6 +7,7 @@ OpenVINO™ Execution Provider for ONNX Runtime accelerates inference across man - Intel® CPUs - Intel® integrated GPUs - Intel® discrete GPUs + - Intel® integrated NPUs (Windows only) Installation ------------ @@ -15,26 +16,27 @@ Requirements ^^^^^^^^^^^^ - Ubuntu 18.04, 20.04, RHEL(CPU only) or Windows 10 - 64 bit -- Python 3.8 or 3.9 or 3.10 for Linux and only Python3.10 for Windows +- Python 3.9 or 3.10 or 3.11 for Linux and Python 3.10, 3.11 for Windows This package supports: - Intel® CPUs - Intel® integrated GPUs - Intel® discrete GPUs + - Intel® integrated NPUs (Windows only) ``pip3 install onnxruntime-openvino`` Please install OpenVINO™ PyPi Package separately for Windows. For installation instructions on Windows please refer to `OpenVINO™ Execution Provider for ONNX Runtime for Windows `_. -**OpenVINO™ Execution Provider for ONNX Runtime** Linux Wheels comes with pre-built libraries of OpenVINO™ version 2023.0.0 eliminating the need to install OpenVINO™ separately. The OpenVINO™ libraries are prebuilt with CXX11_ABI flag set to 0. +**OpenVINO™ Execution Provider for ONNX Runtime** Linux Wheels comes with pre-built libraries of OpenVINO™ version 2024.1.0 eliminating the need to install OpenVINO™ separately. For more details on build and installation please refer to `Build `_. Usage ^^^^^ -By default, Intel® CPU is used to run inference. However, you can change the default option to either Intel® integrated or discrete GPU. +By default, Intel® CPU is used to run inference. However, you can change the default option to either Intel® integrated GPU, discrete GPU, integrated NPU (Windows only). Invoke `the provider config device type argument `_ to change the hardware on which inferencing is done. For more API calls and environment variables, see `Usage `_. diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 1c027e39fa5f5..8f3658df0d09d 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -28,9 +28,8 @@ BackendManager::BackendManager(const GlobalContext& global_context, const onnxruntime::Node& fused_node, const onnxruntime::GraphViewer& subgraph, const logging::Logger& logger, - EPCtxHandler& ctx_handle) { + EPCtxHandler& ep_ctx_handle_) { global_context_ = global_context; - ep_ctx_handle_ = ctx_handle; openvino_sdk_version_ = std::to_string(global_context_.OpenVINO_Version.at(0)) + "." 
+ std::to_string(global_context_.OpenVINO_Version.at(1)); @@ -147,13 +146,20 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie std::string model_blob_str; auto compiled_model = concrete_backend_->GetOVCompiledModel(); - auto graph_name = global_context_.onnx_model_path_name; - // Remove extension so we can append suffix to form the complete name of output graph - graph_name = [&]() { - size_t dot = graph_name.find_last_of("."); - if (dot == std::string::npos) return graph_name; - return graph_name.substr(0, dot); - }(); + std::string graph_name = ""; + // Epctx file path from SO is mapped to cache_dir variable for OVEP for readability + if (global_context_.cache_dir != "") { + graph_name = global_context_.cache_dir; + } else { + graph_name = global_context_.onnx_model_path_name; + // Remove extension so we can append suffix to form the complete name of output graph + graph_name = [&]() { + size_t dot = graph_name.find_last_of("."); + if (dot == std::string::npos) return graph_name; + return graph_name.substr(0, dot); + }(); + graph_name = graph_name + "-ov_" + GetGlobalContext().device_type + "_blob.onnx"; + } // If embed_mode, then pass on the serialized blob // If not embed_mode, dump the blob here and only pass on the path to the blob if (global_context_.ep_context_embed_mode) { @@ -162,9 +168,19 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie model_blob_str = model_blob_stream.str(); ORT_ENFORCE(model_blob_str.size() != 0); } else { - std::ofstream f(graph_name + ".blob", std::ios::out | std::ios::trunc | std::ios::binary); - compiled_model.export_model(f); - model_blob_str = graph_name + ".blob"; + // Remove extension so we can append suffix to form the complete name of output graph + auto blob_name = [&]() { + size_t dot = graph_name.find_last_of("."); + if (dot == std::string::npos) return graph_name; + return graph_name.substr(0, dot); + }(); + std::ofstream blob_file(blob_name + ".blob", + std::ios::out | std::ios::trunc | std::ios::binary); + if (!blob_file) { + ORT_THROW("Unable to open file for epctx model dump."); + } + compiled_model.export_model(blob_file); + model_blob_str = blob_name + ".blob"; } ORT_RETURN_IF_ERROR(ep_ctx_handle_.ExportEPCtxModel(graph_body_viewer, @@ -172,8 +188,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie logger, global_context_.ep_context_embed_mode, model_blob_str, - openvino_sdk_version_, - GetGlobalContext().device_type)); + openvino_sdk_version_)); return Status::OK(); } @@ -248,7 +263,7 @@ static void DumpOpenVINOEPModel(std::string onnx_model_path_name, ONNX_NAMESPACE::ModelProto* model_proto, const onnxruntime::Node& fused_node) { if (openvino_ep::backend_utils::IsDebugEnabled()) { - auto model_name = onnx_model_path_name.empty() ? "unknown.onnx" : onnx_model_path_name; + auto model_name = onnx_model_path_name.empty() ? 
"unknown.onnx" : std::move(onnx_model_path_name); #ifdef _WIN32 size_t slash = model_name.find_last_of("\\"); #else diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index f8046bcb3a06f..d79aa35be6418 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -37,7 +37,7 @@ BasicBackend::BasicBackend(const ONNX_NAMESPACE::ModelProto& model_proto, PopulateConfigValue(device_config); // Enable caching - EnableCaching(); + EnableCaching(device_config); // Setting OpenCL queue throttling for GPU EnableGPUThrottling(device_config); @@ -82,26 +82,28 @@ BasicBackend::BasicBackend(const ONNX_NAMESPACE::ModelProto& model_proto, ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); } #else // !IO_BUFFER_ENABLED + std::string prec_str = (global_context_.precision_str != "ACCURACY") ? global_context_.precision_str : global_context_.model_precision; if (is_ep_ctx_graph_) { // If the blob is held in an EPContext node, then skip FE+Compile // and directly move on to creating a backend with the executable blob exe_network_ = global_context_.ie_core.ImportModel(ep_ctx_handle.GetModelBlobStream(), hw_target, device_config, + global_context_.ep_context_embed_mode, subgraph_context_.subgraph_name); ie_cnn_network_ = exe_network_.Get().get_runtime_model(); - } else if (!subgraph_context_.has_dynamic_input_shape) { + } else if ((!subgraph_context_.has_dynamic_input_shape) && + ((hw_target.find("AUTO") == std::string::npos) || + (global_context_.OpenVINO_Version.at(0) >= 2024 && global_context_.OpenVINO_Version.at(1) > 2))) { + // Optimized OV compile_model API is supported with AUTO from version 2024.3 and above // Inputs with static dimenstions - std::string prec_str = (global_context_.precision_str != "ACCURACY") ? 
global_context_.precision_str : global_context_.model_precision; const std::string model = model_proto.SerializeAsString(); exe_network_ = global_context_.ie_core.CompileModel(model, hw_target, - prec_str, - global_context_.cache_dir, device_config, subgraph_context_.subgraph_name); ie_cnn_network_ = exe_network_.Get().get_runtime_model(); - } else { // Inputs with dynamic dimensions + } else { // For all other types use ov::Model Type ie_cnn_network_ = CreateOVModel(model_proto, global_context_, const_outputs_map_); exe_network_ = global_context_.ie_core.CompileModel( ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); @@ -173,13 +175,19 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { } } -void BasicBackend::EnableCaching() { +void BasicBackend::EnableCaching(ov::AnyMap& device_config) { // cache_dir argument has no effect when working with an embed-mode EPContext Graph if (is_ep_ctx_graph_) return; - if (!global_context_.cache_dir.empty()) { + if (!global_context_.cache_dir.empty() && !global_context_.export_ep_ctx_blob) { LOGS_DEFAULT(INFO) << log_tag << "Enables Caching"; - global_context_.ie_core.SetCache(global_context_.cache_dir, global_context_.device_type); + if (global_context_.device_type.find("AUTO:GPU") != std::string::npos) { + std::pair device_property; + device_property = std::make_pair("CACHE_DIR", global_context_.cache_dir); + device_config.emplace(ov::device::properties("GPU", device_property)); + } else { + global_context_.ie_core.SetCache(global_context_.cache_dir); + } } } @@ -274,7 +282,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque } try { - infer_request->SetTensor(input_name, tensor_ptr); + infer_request->SetTensor(std::move(input_name), tensor_ptr); } catch (const char* msg) { ORT_THROW(msg); } diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index 5565223f067b8..bcd3161590ba0 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -37,7 +37,7 @@ class BasicBackend : public IBackend { void PopulateCompiledDirectory(std::string, std::string&, std::string&, bool&); bool ValidateSubgraph(std::map>& const_outputs_map); void PopulateConfigValue(ov::AnyMap& device_config); - void EnableCaching(); + void EnableCaching(ov::AnyMap& device_config); void EnableGPUThrottling(ov::AnyMap& device_config); void EnableStreams(); void SetNumThreads(ov::AnyMap& device_config); diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc index cd1ae6150e1da..e2df9c83f15ae 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.cc @@ -19,8 +19,7 @@ Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, const logging::Logger& logger, const bool& ep_context_embed_mode, const std::string& model_blob_str, - const std::string& openvino_sdk_version, - const std::string& device_type) const { + const std::string& openvino_sdk_version) const { auto model_build = graph_viewer.CreateModel(logger); auto& graph_build = model_build->MainGraph(); @@ -77,9 +76,12 @@ Status EPCtxHandler::ExportEPCtxModel(const GraphViewer& graph_viewer, model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); // Finally, dump the model - std::ofstream dump(graph_name + "-ov_" + 
device_type + "_blob.onnx", - std::ios::out | std::ios::trunc | std::ios::binary); - model_proto->SerializeToOstream(dump); + std::ofstream epctx_onnx_model(graph_name, + std::ios::out | std::ios::trunc | std::ios::binary); + if (!epctx_onnx_model) { + ORT_THROW("Unable to create epctx onnx model file "); + } + model_proto->SerializeToOstream(epctx_onnx_model); LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Export blob as EPContext Node"; @@ -90,9 +92,7 @@ Status EPCtxHandler::ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer) { auto node = graph_viewer.GetNode(0); auto& attrs = node->GetAttributes(); ORT_ENFORCE(attrs.count(EP_CACHE_CONTEXT) > 0); - model_stream_ = std::make_shared(attrs.at(EP_CACHE_CONTEXT).s()); - LOGS_DEFAULT(VERBOSE) << "[OpenVINO EP] Read blob from EPContext Node"; is_valid_ep_ctx_graph_ = true; diff --git a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h index b2b9b5bc53d44..610e9fd49c901 100644 --- a/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h +++ b/onnxruntime/core/providers/openvino/onnx_ctx_model_helper.h @@ -29,8 +29,7 @@ class EPCtxHandler { const logging::Logger& logger, const bool& ep_context_embed_mode, const std::string& model_blob_str, - const std::string& openvino_sdk_version, - const std::string& device_type) const; + const std::string& openvino_sdk_version) const; Status ImportBlobFromEPCtxModel(const GraphViewer& graph_viewer); bool CheckForOVEPCtxNode(const GraphViewer& graph_viewer, std::string openvino_sdk_version) const; bool IsValidOVEPCtxGraph() const { return is_valid_ep_ctx_graph_; } diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 655e1b180388b..5627cb2c122fb 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -34,6 +34,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv global_context_->export_ep_ctx_blob = info.export_ep_ctx_blob_; global_context_->enable_qdq_optimizer = info.enable_qdq_optimizer_; global_context_->disable_cpu_fallback = info.disable_cpu_fallback_; + global_context_->ep_context_embed_mode = info.so_epctx_embed_mode_; // to check if target device is available // using ie_core capability GetAvailableDevices to fetch list of devices plugged in @@ -47,7 +48,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv info.device_type_.find("AUTO") != std::string::npos) { device_found = true; } else { - for (std::string device : available_devices) { + for (const std::string& device : available_devices) { if (device.rfind(info.device_type_, 0) == 0) { if (info.device_type_.find("GPU") != std::string::npos && (info.precision_ == "FP32" || info.precision_ == "FP16" || diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 050fb91c51771..030e5bba71b67 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -16,16 +16,23 @@ namespace onnxruntime { +struct OVDevices { + ov::Core core; + std::vector get_ov_devices() const { + return core.get_available_devices(); + } +}; + static void print_build_options() { std::cout << "[ERROR] INVALID DEVICE BUILD TYPE SPECIFIED" << std::endl; std::cout << 
"Specify the keyword HETERO (or) MULTI (or) AUTO followed by the devices in the order of priority " << "you want to build" << std::endl; std::cout << "The different hardware devices that can be added with HETERO/MULTI/AUTO build " - << "are ['CPU','GPU','NPU']" + << "are ['CPU','GPU','NPU','GPU.x'] where x = 0,1,2 and so on" << std::endl; std::cout << "An example of how to specify the HETERO or MULTI or AUTO build type. " - << "Ex: HETERO:GPU,CPU Ex: MULTI:GPU,CPU Ex: AUTO:GPU,CPU" + << "Ex: HETERO:GPU,CPU Ex: MULTI:GPU,CPU Ex: AUTO:GPU,CPU Ex: AUTO:GPU.0,CPU Ex: AUTO:GPU.1,CPU" << std::endl; } @@ -40,7 +47,8 @@ static std::vector split(const std::string& s, char delim) { return result; } -static std::vector parseDevices(const std::string& device_string) { +static std::vector parseDevices(const std::string& device_string, + const std::vector& available_devices) { std::string comma_separated_devices = device_string; if (comma_separated_devices.find(":") != std::string::npos) { comma_separated_devices = comma_separated_devices.substr(comma_separated_devices.find(":") + 1); @@ -50,8 +58,15 @@ static std::vector parseDevices(const std::string& device_string) { print_build_options(); ORT_THROW("Invalid device string: " + device_string); } - std::vector dev_options = {"CPU", "GPU", "NPU"}; - for (std::string dev : devices) { + std::set dev_options = {"CPU", "GPU", "NPU"}; + + for (auto& device : available_devices) { + if (dev_options.find(device) == dev_options.end()) { + auto dev_options_update = dev_options.emplace(device); + } + } + + for (const std::string& dev : devices) { if (!std::count(dev_options.begin(), dev_options.end(), dev)) { print_build_options(); ORT_THROW("Invalid device string: " + device_string); @@ -75,28 +90,42 @@ struct OpenVINOExecutionProviderInfo { bool export_ep_ctx_blob_{false}; bool enable_qdq_optimizer_{false}; bool disable_cpu_fallback_{false}; + bool so_epctx_embed_mode_{true}; OpenVINOExecutionProviderInfo() = delete; - explicit OpenVINOExecutionProviderInfo(std::string dev_type, std::string precision, bool enable_npu_fast_compile, - size_t num_of_threads, std::string cache_dir, std::string model_priority, + explicit OpenVINOExecutionProviderInfo(const std::string& dev_type, const std::string& precision, + bool enable_npu_fast_compile, size_t num_of_threads, + const std::string& cache_dir, const std::string& model_priority, int num_streams, void* context, bool enable_opencl_throttling, bool disable_dynamic_shapes, bool export_ep_ctx_blob, - bool enable_qdq_optimizer, bool disable_cpu_fallback) - : precision_(precision), + bool enable_qdq_optimizer, bool disable_cpu_fallback, + bool so_epctx_embed_mode) + : precision_(std::move(precision)), enable_npu_fast_compile_(enable_npu_fast_compile), num_of_threads_(num_of_threads), cache_dir_(std::move(cache_dir)), - model_priority_(model_priority), + model_priority_(std::move(model_priority)), num_streams_(num_streams), context_(context), enable_opencl_throttling_(enable_opencl_throttling), disable_dynamic_shapes_(disable_dynamic_shapes), export_ep_ctx_blob_(export_ep_ctx_blob), enable_qdq_optimizer_(enable_qdq_optimizer), - disable_cpu_fallback_(disable_cpu_fallback) { + disable_cpu_fallback_(disable_cpu_fallback), + so_epctx_embed_mode_{so_epctx_embed_mode} { std::set ov_supported_device_types = {"CPU", "GPU", "GPU.0", "GPU.1", "NPU"}; + + OVDevices devices; + std::vector available_devices = devices.get_ov_devices(); + + for (auto& device : available_devices) { + if (ov_supported_device_types.find(device) == 
ov_supported_device_types.end()) { + ov_supported_device_types.emplace(device); + } + } + if (dev_type == "") { LOGS_DEFAULT(INFO) << "[OpenVINO-EP]" << "No runtime device selection option provided."; @@ -116,7 +145,7 @@ struct OpenVINOExecutionProviderInfo { dev_type = DEVICE; if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { - std::vector devices = parseDevices(dev_type); + std::vector devices = parseDevices(dev_type, available_devices); precision_ = "FP16"; if (devices[0] == "CPU") { precision_ = "FP32"; @@ -127,7 +156,7 @@ struct OpenVINOExecutionProviderInfo { } else if (ov_supported_device_types.find(dev_type) != ov_supported_device_types.end()) { device_type_ = std::move(dev_type); } else if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0 || dev_type.find("AUTO") == 0) { - std::vector devices = parseDevices(dev_type); + std::vector devices = parseDevices(dev_type, available_devices); device_type_ = dev_type; } else { ORT_THROW("Invalid device string: " + dev_type); diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 45bba431741c5..716a7cd936405 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -14,7 +14,8 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { int num_streams, void* context, bool enable_opencl_throttling, bool disable_dynamic_shapes, bool export_ep_ctx_blob, bool enable_qdq_optimizer, - bool disable_cpu_fallback) + bool disable_cpu_fallback, + bool so_epctx_embed_mode) : precision_(precision), enable_npu_fast_compile_(enable_npu_fast_compile), num_of_threads_(num_of_threads), @@ -25,10 +26,12 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { disable_dynamic_shapes_(disable_dynamic_shapes), export_ep_ctx_blob_(export_ep_ctx_blob), enable_qdq_optimizer_(enable_qdq_optimizer), - disable_cpu_fallback_(disable_cpu_fallback) { + disable_cpu_fallback_(disable_cpu_fallback), + so_epctx_embed_mode_(so_epctx_embed_mode) { device_type_ = (device_type == nullptr) ? "" : device_type; cache_dir_ = (cache_dir == nullptr) ? 
"" : cache_dir; } + ~OpenVINOProviderFactory() override { } @@ -48,13 +51,15 @@ struct OpenVINOProviderFactory : IExecutionProviderFactory { bool export_ep_ctx_blob_; bool enable_qdq_optimizer_; bool disable_cpu_fallback_; + bool so_epctx_embed_mode_; }; std::unique_ptr OpenVINOProviderFactory::CreateProvider() { OpenVINOExecutionProviderInfo info(device_type_, precision_, enable_npu_fast_compile_, num_of_threads_, cache_dir_, model_priority_, num_streams_, context_, enable_opencl_throttling_, disable_dynamic_shapes_, export_ep_ctx_blob_, enable_qdq_optimizer_, - disable_cpu_fallback_); + disable_cpu_fallback_, + so_epctx_embed_mode_); return std::make_unique(info); } @@ -105,6 +110,8 @@ struct OpenVINO_Provider : Provider { bool disable_cpu_fallback = false; + bool so_epctx_embed_mode = true; + if (provider_options_map.find("device_type") != provider_options_map.end()) { device_type = provider_options_map.at("device_type").c_str(); @@ -113,6 +120,14 @@ struct OpenVINO_Provider : Provider { std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", "GPU.0_FP16", "GPU.1_FP16"}; + OVDevices devices; + std::vector available_devices = devices.get_ov_devices(); + + for (auto& device : available_devices) { + if (ov_supported_device_types.find(device) == ov_supported_device_types.end()) { + ov_supported_device_types.emplace(device); + } + } if (deprecated_device_types.find(device_type) != deprecated_device_types.end()) { std::string deprecated_device = device_type; int delimit = device_type.find("_"); @@ -128,8 +143,8 @@ struct OpenVINO_Provider : Provider { (device_type.find("MULTI:") == 0) || (device_type.find("AUTO:") == 0))) { ORT_THROW( - "[ERROR] [OpenVINO] You have selcted wrong configuration value for the key 'device_type'. " - "Select from 'CPU', 'GPU', 'GPU.0', 'GPU.1', 'NPU' or from" + "[ERROR] [OpenVINO] You have selected wrong configuration value for the key 'device_type'. " + "Select from 'CPU', 'GPU', 'NPU', 'GPU.x' where x = 0,1,2 and so on or from" " HETERO/MULTI/AUTO options available. \n"); } } @@ -253,9 +268,8 @@ struct OpenVINO_Provider : Provider { } } } - - if (provider_options_map.find("export_ep_ctx_blob") != provider_options_map.end()) { - bool_flag = provider_options_map.at("export_ep_ctx_blob"); + if (provider_options_map.find("so_export_ep_ctx_blob") != provider_options_map.end()) { + bool_flag = provider_options_map.at("so_export_ep_ctx_blob"); if (bool_flag == "true" || bool_flag == "True") export_ep_ctx_blob = true; else if (bool_flag == "false" || bool_flag == "False") @@ -271,6 +285,23 @@ struct OpenVINO_Provider : Provider { disable_cpu_fallback = false; bool_flag = ""; } + if (provider_options_map.find("so_epctx_embed_mode") != provider_options_map.end()) { + bool_flag = provider_options_map.at("so_epctx_embed_mode"); + if (bool_flag == "true" || bool_flag == "True") + so_epctx_embed_mode = true; + else if (bool_flag == "false" || bool_flag == "False") + so_epctx_embed_mode = false; + bool_flag = ""; + } + + if (provider_options_map.find("so_epctx_path") != provider_options_map.end()) { + // The path to dump epctx model is valid only when epctx is enabled. + // Overrides the cache_dir option to dump model cache files from OV. 
+ if (export_ep_ctx_blob) { + cache_dir = provider_options_map.at("so_epctx_path").c_str(); + } + } + return std::make_shared(const_cast(device_type.c_str()), const_cast(precision.c_str()), enable_npu_fast_compile, @@ -283,7 +314,8 @@ struct OpenVINO_Provider : Provider { disable_dynamic_shapes, export_ep_ctx_blob, enable_qdq_optimizer, - disable_cpu_fallback); + disable_cpu_fallback, + so_epctx_embed_mode); } void Initialize() override { diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index 8dd00857b7dd0..7e8681d304abf 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -63,7 +63,6 @@ std::shared_ptr OVCore::ReadModel(const std::string& model, const std return FE->convert(inputModel); } else { ORT_THROW(log_tag + "[OpenVINO-EP] Unknown exception while Reading network"); - return NULL; } } catch (const Exception& e) { ORT_THROW(log_tag + "[OpenVINO-EP] Exception while Reading network: " + std::string(e.what())); @@ -73,9 +72,9 @@ std::shared_ptr OVCore::ReadModel(const std::string& model, const std } OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_network, - std::string hw_target, - const ov::AnyMap& device_config, - std::string name) { + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name) { ov::CompiledModel obj; try { obj = oe.compile_model(ie_cnn_network, hw_target, device_config); @@ -92,22 +91,12 @@ OVExeNetwork OVCore::CompileModel(std::shared_ptr& ie_cnn_netwo } OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, - std::string hw_target, - std::string precision, - std::string cache_dir, - const ov::AnyMap& device_config, - std::string name) { + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name) { ov::CompiledModel obj; try { - if (hw_target == "AUTO:GPU,CPU") { - obj = oe.compile_model(onnx_model, ov::Tensor(), - "AUTO", - ov::device::priorities("GPU", "CPU"), - ov::device::properties("GPU", {ov::cache_dir(cache_dir), - ov::hint::inference_precision(precision)})); - } else { - obj = oe.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); - } + obj = oe.compile_model(onnx_model, ov::Tensor(), hw_target, device_config); #ifndef NDEBUG printDebugInfo(obj); #endif @@ -123,9 +112,19 @@ OVExeNetwork OVCore::CompileModel(const std::string& onnx_model, OVExeNetwork OVCore::ImportModel(std::shared_ptr model_stream, std::string hw_target, const ov::AnyMap& device_config, + bool embed_mode, std::string name) { try { - auto obj = oe.import_model(*model_stream, hw_target, device_config); + ov::CompiledModel obj; + if (embed_mode) { + obj = oe.import_model(*model_stream, hw_target, device_config); + } else { + std::string blob_file_path = (*model_stream).str(); + std::ifstream modelStream(blob_file_path, std::ios_base::binary | std::ios_base::in); + obj = oe.import_model(modelStream, + hw_target, + {}); + } #ifndef NDEBUG printDebugInfo(obj); #endif @@ -138,10 +137,8 @@ OVExeNetwork OVCore::ImportModel(std::shared_ptr model_strea } } -void OVCore::SetCache(std::string cache_dir_path, std::string device_type) { - if (device_type != "AUTO:GPU,CPU") { - oe.set_property(ov::cache_dir(cache_dir_path)); - } +void OVCore::SetCache(const std::string& cache_dir_path) { + oe.set_property(ov::cache_dir(cache_dir_path)); } #ifdef IO_BUFFER_ENABLED diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index 
af6f252feb2ce..fa22e0f3cb03d 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -40,20 +40,23 @@ class OVCore { ov::Core oe; public: + // OV Interface For Reading Model std::shared_ptr ReadModel(const std::string& model_stream, const std::string& model_path) const; + // OV Interface for Compiling OV Model Type OVExeNetwork CompileModel(std::shared_ptr& ie_cnn_network, - std::string hw_target, - const ov::AnyMap& device_config, - std::string name); + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name); + // OV Interface for Fast Compile OVExeNetwork CompileModel(const std::string& onnx_model, - std::string hw_target, - std::string precision, - std::string cache_dir, - const ov::AnyMap& device_config, - std::string name); + std::string& hw_target, + ov::AnyMap& device_config, + const std::string& name); + // OV Interface for Import model Stream OVExeNetwork ImportModel(std::shared_ptr model_stream, std::string hw_target, const ov::AnyMap& device_config, + bool embed_mode, std::string name); #ifdef IO_BUFFER_ENABLED OVExeNetwork CompileModel(std::shared_ptr& model, @@ -64,7 +67,7 @@ class OVCore { std::string name); #endif std::vector GetAvailableDevices(); - void SetCache(std::string cache_dir_path, std::string device_type); + void SetCache(const std::string& cache_dir_path); ov::Core& Get() { return oe; } void SetStreams(const std::string& device_type, int num_streams); }; diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 856b97a0896db..3fcaff4369c89 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -35,18 +35,16 @@ GetCapability::GetCapability(const GraphViewer& graph_viewer_param, device_type_ = "CPU"; if (enable_qdq_optimizer) npu_qdq_optimizer_enabled = true; } -#if OPENVINO_VERSION_MAJOR == 2023 && OPENVINO_VERSION_MINOR == 1 - data_ops_ = new DataOps(graph_viewer_, V_2023_1, device_type_, npu_qdq_optimizer_enabled); -#elif OPENVINO_VERSION_MAJOR == 2023 && OPENVINO_VERSION_MINOR == 2 - data_ops_ = new DataOps(graph_viewer_, V_2023_2, device_type_, npu_qdq_optimizer_enabled); -#elif OPENVINO_VERSION_MAJOR == 2023 && OPENVINO_VERSION_MINOR == 3 - data_ops_ = new DataOps(graph_viewer_, V_2023_3, device_type_, npu_qdq_optimizer_enabled); -#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 0 +#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 0 data_ops_ = new DataOps(graph_viewer_, V_2024_0, device_type_, npu_qdq_optimizer_enabled); #elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 1 data_ops_ = new DataOps(graph_viewer_, V_2024_1, device_type_, npu_qdq_optimizer_enabled); +#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 2 + data_ops_ = new DataOps(graph_viewer_, V_2024_2, device_type_, npu_qdq_optimizer_enabled); +#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 3 + data_ops_ = new DataOps(graph_viewer_, V_2024_3, device_type_, npu_qdq_optimizer_enabled); #else - data_ops_ = new DataOps(graph_viewer_, V_2024_1, device_type_, npu_qdq_optimizer_enabled); + data_ops_ = new DataOps(graph_viewer_, V_2024_3, device_type_, npu_qdq_optimizer_enabled); #endif } diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index 38c029faff9d5..d9aa13ec1bba9 
100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -142,6 +142,7 @@ std::vector supported_op_mode = { {"GreaterOrEqual", V_2022_1, {"CPU", "GPU"}}, {"GridSample", V_2022_3, {"CPU"}}, {"GridSample", V_2023_0, {"GPU"}}, + {"GRU", V_2024_1, {"CPU", "GPU"}}, {"HardMax", V_2023_1, {"CPU", "GPU"}}, {"Identity", V_2020_4, {"CPU", "GPU"}}, {"If", V_2022_3, {"CPU", "GPU"}}, @@ -155,6 +156,7 @@ std::vector supported_op_mode = { {"LessOrEqual", V_2022_1, {"CPU", "GPU"}}, {"Log", V_2020_4, {"CPU", "GPU"}}, {"LogSoftMax", V_2022_1, {"CPU", "GPU"}}, + {"LogSoftmax", V_2024_1, {"CPU", "GPU"}}, {"Loop", V_2021_4, {"CPU", "GPU"}}, {"LpNormalization", V_2023_1, {"CPU", "GPU"}}, {"LRN", V_2020_4, {"CPU", "GPU"}}, @@ -361,7 +363,7 @@ void DataOps::populate_op_mode_supported() { // populate unsupportedmode_t { - UnsupportedOpMode obj = {{V_2024_1}, + UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3}, [this](const Node* node, const InitializedTensorSet&) { // If the Input of ReduceMax op is UINT8, it is rejected (Due to output mismatch) for (size_t i = 0; i < node->InputDefs().size(); i++) { @@ -376,7 +378,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"ReduceMax", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3}, [this](const Node* node, const InitializedTensorSet&) { const auto& input_arg = node->InputDefs()[1]; auto shape = input_arg->Shape(); @@ -393,7 +395,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Reshape", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3}, [this](const Node* node, const InitializedTensorSet&) { // If the operator is unsqueeze // If axes is an input, then we cannot produce a static graph. 
@@ -408,7 +410,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Unsqueeze", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3}, [this](const Node* node, const InitializedTensorSet&) { // check for attributes auto& upsample_attr = node->GetAttributes(); diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h index 7cfb0516b8ccf..4c064b08405c1 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h @@ -28,7 +28,9 @@ enum versionNum { V_2023_2, V_2023_3, V_2024_0, - V_2024_1 + V_2024_1, + V_2024_2, + V_2024_3 }; using VersionNum = enum versionNum; diff --git a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc index c7689a0be7e73..a2b3ed068235b 100644 --- a/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc +++ b/onnxruntime/core/providers/openvino/qdq_transformations/qdq_stripping.cc @@ -205,11 +205,11 @@ static bool IsConnectedQAConstantInitializer(const Node* dq_node, const onnxrunt // Check required because in some cases, when a NodeUnit cannot be formed with this standalone DQ // we still need to check if it feeds into a supported Op -static bool DQFeedsASupportedOp(const Node* dq_node, const onnxruntime::GraphViewer& src_graph) { +static bool DQFeedsASupportedOp(const Node* dq_node) { if (!dq_node->GetOutputEdgesCount()) return false; // Only feeds the graph output, and not any node const auto& target_node = *dq_node->OutputNodesBegin(); - const auto op_type = target_node.OpType(); + const auto& op_type = target_node.OpType(); if (op_type == "Conv" || op_type == "MatMul") { // Conv and MatMul always keeps int8 DQs except if the DQ is sandwiched between Softmax and Conv/MatMul @@ -219,8 +219,8 @@ static bool DQFeedsASupportedOp(const Node* dq_node, const onnxruntime::GraphVie return true; } } else if (op_type == "Add") { - // Add keeps all DQs except if it has const inits - return !IsAnyDQAConstantInitializer(&target_node, src_graph); + // Add => keeps all DQs + return true; } return false; } @@ -291,7 +291,7 @@ static bool CheckDQRuleSet(const NodeUnit& node_unit, const onnxruntime::GraphViewer& src_graph, SkipReason& reason) { const auto& target_node = node_unit.GetNode(); - auto op_type = node_unit.OpType(); + const auto& op_type = node_unit.OpType(); // #1 Reverse DQ duplication if (dq_node->Name().find(DuplicateDQ) != std::string::npos) { @@ -337,6 +337,18 @@ static bool CheckDQRuleSet(const NodeUnit& node_unit, } } +static bool CheckQFeedsIntoQuantizedOutput(const NodeUnit& node_unit, + const std::unordered_map graph_op_data_type) { + auto op_of_quantized_layer = node_unit.Outputs(); + for (auto& itr : op_of_quantized_layer) { + auto it = graph_op_data_type.find(itr.node_arg.Name()); + if (it != graph_op_data_type.end() && it->second == "tensor(uint8)") { + return true; + } + } + return false; +} + static bool CheckQRuleSet(const NodeUnit& node_unit, const Node* q_node, const onnxruntime::GraphViewer& src_graph, @@ -345,7 +357,13 @@ static bool CheckQRuleSet(const NodeUnit& node_unit, // This Q should also be uint8 const auto& target_node = node_unit.GetNode(); - auto op_type = node_unit.OpType(); + const auto& op_type = node_unit.OpType(); + + auto op = 
src_graph.GetOutputs(); + std::unordered_map graph_op_data_type; + for (auto& ops : op) { + graph_op_data_type[src_graph.GetNodeArg(ops->Name())->Name()] = ops->Type()->data(); + } // If UInt16 Q, don't keep it if (GetQDQDataType(q_node) == DT_UINT16 || GetQDQDataType(q_node) == DT_INT16) { @@ -359,6 +377,8 @@ static bool CheckQRuleSet(const NodeUnit& node_unit, } else if (op_type == "Add") { // Add keeps all Qs return true; + } else if (CheckQFeedsIntoQuantizedOutput(node_unit, std::move(graph_op_data_type))) { + return true; } else { // Keep Q of an unsupported Op only if the target that succeeds it is a supported Op in this list return IsNextTargetNodeOfQValid(q_node, &target_node, src_graph, {"Conv", "Add", "MatMul"}, false); @@ -469,7 +489,7 @@ static void AddStandaloneNodeUnit(onnxruntime::Graph& dst_graph, const onnxrunti add_identity_op(true); else if (IsConnectedQPresent(src_graph, dst_graph.Nodes(), &node_unit.GetNode(), node_unit.GetNode().InputDefs())) AddNode(initializers_to_keep, src_graph, dst_graph, node_unit.GetNode()); - else if (DQFeedsASupportedOp(&node_unit.GetNode(), src_graph)) + else if (DQFeedsASupportedOp(&node_unit.GetNode())) AddNode(initializers_to_keep, src_graph, dst_graph, node_unit.GetNode()); else add_identity_op(false); @@ -543,7 +563,7 @@ static void AddQDQNodeUnit(onnxruntime::Graph& dst_graph, // Add Node args for inputs for (const auto& node_unit_input : node_unit_inputs) { - auto node_arg_name = node_unit_input.node_arg.Name(); + const auto& node_arg_name = node_unit_input.node_arg.Name(); if (auto dq_node_arg = dq_node_args_to_keep.find(node_arg_name); dq_node_arg != dq_node_args_to_keep.end()) { // Add supported DQ as an input arg for the target node input_args.push_back(dq_node_arg->second); diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 1d21933e9cba9..924158a26b927 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -1931,12 +1931,31 @@ void ORTSessionOptionsToOrtOpenVINOProviderOptions(ProviderOptions& ov_options, kOrtSessionOptionsDisableCPUEPFallback, "0") == "1"; if (disable_cpu_fallback) ov_options["disable_cpu_fallback"] = "true"; + + // values from session options will override the providerOptions Value + bool so_epctx_enable = session_options->config_options.GetConfigOrDefault( + kOrtSessionOptionEpContextEnable, "0") == "1"; + if (so_epctx_enable) + ov_options["so_export_ep_ctx_blob"] = "true"; + + std::string so_cache_path = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, "").c_str(); + ov_options["so_epctx_path"] = so_cache_path; + + // Default embedMode is 1. Saving the compiled model contents as a Epctx node attribute + bool so_epctx_embed_mode = session_options->config_options.GetConfigOrDefault( + kOrtSessionOptionEpContextEmbedMode, "1") == "0"; + if (so_epctx_embed_mode) { + // defaults to true + ov_options["so_epctx_embed_mode"] = "false"; + } } std::shared_ptr OpenVINOProviderFactoryCreator::Create(ProviderOptions* provider_options_map, const SessionOptions* session_options) { - if (session_options) + // Append session options applicable for EP to EP Provider options. 
+ if (session_options) { onnxruntime::ORTSessionOptionsToOrtOpenVINOProviderOptions(*provider_options_map, session_options); + } return s_library_openvino.Get().CreateExecutionProviderFactory(provider_options_map); } diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc index e6d4e0a94abd3..84c3bc16346f3 100644 --- a/onnxruntime/test/perftest/command_args_parser.cc +++ b/onnxruntime/test/perftest/command_args_parser.cc @@ -253,7 +253,6 @@ static bool ParseSessionConfigs(const std::string& configs_string, test_config.machine_config.provider_type_name = onnxruntime::kDnnlExecutionProvider; } else if (!CompareCString(optarg, ORT_TSTR("openvino"))) { test_config.machine_config.provider_type_name = onnxruntime::kOpenVINOExecutionProvider; - test_config.run_config.optimization_level = ORT_DISABLE_ALL; } else if (!CompareCString(optarg, ORT_TSTR("tensorrt"))) { test_config.machine_config.provider_type_name = onnxruntime::kTensorrtExecutionProvider; } else if (!CompareCString(optarg, ORT_TSTR("qnn"))) { diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 72b5da7aaec9b..fc1bdb10d7453 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -699,6 +699,10 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)"); std::set deprecated_device_types = {"CPU_FP32", "GPU_FP32", "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", "GPU.0_FP16", "GPU.1_FP16"}; + size_t num_gpus = 10; + for (size_t i = 0; i <= num_gpus; i++) { + ov_supported_device_types.emplace("GPU." + std::to_string(i)); + } if (ov_supported_device_types.find(value) != ov_supported_device_types.end()) { ov_options[key] = value; } else if (deprecated_device_types.find(value) != deprecated_device_types.end()) { diff --git a/onnxruntime/test/providers/checkers.cc b/onnxruntime/test/providers/checkers.cc index d0e08448ce456..5f332ddcddb8d 100644 --- a/onnxruntime/test/providers/checkers.cc +++ b/onnxruntime/test/providers/checkers.cc @@ -25,7 +25,15 @@ struct DefaultTolerance { static constexpr float relative = 1e-5f; // Allow to have different default absolute tolerance for different providers. 
- static float get_absolute(const std::string& /*provider_type*/) { + static float get_absolute(const std::string& provider_type /*provider_type*/) { + if (provider_type == kOpenVINOExecutionProvider) { +#ifdef OPENVINO_CONFIG_NPU + return 0.005f; +#else + return absolute; +#endif + } + return absolute; } }; @@ -40,7 +48,15 @@ struct DefaultTolerance { static constexpr float relative = 1e-4f; - static float get_absolute(const std::string& /*provider_type*/) { + static float get_absolute(const std::string& provider_type /*provider_type*/) { + if (provider_type == kOpenVINOExecutionProvider) { +#ifdef OPENVINO_CONFIG_NPU + return 0.005f; +#else + return absolute; +#endif + } + return absolute; } }; diff --git a/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc b/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc index b05649dafc181..30960e71c577f 100644 --- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc +++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_gru_op_test.cc @@ -98,8 +98,12 @@ static void RunGruTest(const std::vector& X_data, test.AddOptionalOutputEdge(); } - // TensorRT failed on GRU tests +// TensorRT, OpenVINO failed on GRU tests +#if defined(USE_OPENVINO) + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); +#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +#endif } void DefaultActivationsSimpleWeightsNoBias(std::string direction, From 6787cf18a5ee0196b376926b9a7080e925d4756d Mon Sep 17 00:00:00 2001 From: Yueqing Zhang Date: Thu, 25 Jul 2024 07:18:55 -0700 Subject: [PATCH 16/57] [VitisAI] use binary mode for context ep (#21474) ### Description We found that the text format could cause errors. ### Motivation and Context Because the OS could change the string contents, we decided to save it as a binary file.
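For illustration only (not part of the change itself): a minimal, self-contained C++ sketch of the failure mode this patch avoids. On Windows, a text-mode stream translates each 0x0A byte into the 0x0D 0x0A sequence, so an opaque compilation-cache blob written without std::ios::binary can grow and fail to round-trip, while binary mode writes the bytes verbatim. The file names below are hypothetical.

```cpp
// Hypothetical demo: writing the same byte blob in text mode vs. binary mode.
#include <fstream>
#include <string>

int main() {
  const std::string blob("\x01\x0A\x02", 3);  // pretend cache bytes that contain a 0x0A byte

  std::ofstream text_ofs("cache_text.bin", std::ios::trunc);  // text mode: 0x0A may be rewritten as 0x0D 0x0A on Windows
  text_ofs.write(blob.data(), static_cast<std::streamsize>(blob.size()));

  std::ofstream bin_ofs("cache_binary.bin", std::ios::trunc | std::ios::binary);  // binary mode: bytes written as-is
  bin_ofs.write(blob.data(), static_cast<std::streamsize>(blob.size()));
  return 0;
}
```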
--- .../core/providers/vitisai/vitisai_execution_provider.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 036831df7a9cf..0f0972d96bcee 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -100,7 +100,7 @@ void VitisAIExecutionProvider::FulfillEPContextEnablement( auto& ep_ctx_graph = p_ep_ctx_model_->MainGraph(); if (!ep_ctx_embed_mode_) { auto ep_ctx_cache_path_str = GetEPContextCacheFileLocation(ep_ctx_model_file_loc_, model_path_str_); - std::ofstream ep_ctx_cache_ofs(ep_ctx_cache_path_str.c_str(), std::ios::trunc); + std::ofstream ep_ctx_cache_ofs(ep_ctx_cache_path_str.c_str(), std::ios::trunc | std::ios::binary); if (!ep_ctx_cache_ofs.is_open()) { ORT_THROW("Failed to open a file to write EP context cache: ", ep_ctx_cache_path_str.c_str()); } @@ -136,7 +136,7 @@ std::vector> VitisAIExecutionProvider::GetCap info_["cacheDir"] = cache_dir; info_["cacheKey"] = cache_key; LOGS_DEFAULT(VERBOSE) << "Trying getting compilation cache from " << PathToUTF8String(ep_ctx_model_file_loc_); - auto ep_ctx_payload = RetrieveEPContextCache(graph_viewer.GetGraph(), ep_ctx_model_file_loc_, false); + auto ep_ctx_payload = RetrieveEPContextCache(graph_viewer.GetGraph(), ep_ctx_model_file_loc_, true); restore_backend_compilation_cache(cache_dir, cache_key, ep_ctx_payload, graph_viewer.ModelPath().string()); } else { if (fs::exists(ep_ctx_model_file_loc_) && fs::is_regular_file(ep_ctx_model_file_loc_) && ep_ctx_enabled_) { From f3a6e58ae3358da65f1753ca2322e5f4475ae661 Mon Sep 17 00:00:00 2001 From: Sophie Schoenmeyer <107952697+sophies927@users.noreply.github.com> Date: Thu, 25 Jul 2024 09:52:37 -0700 Subject: [PATCH 17/57] Update 05-performance.yml issue template to auto apply label (#21486) Updating Performance issue template so "performance" label is automatically applied ### Description ### Motivation and Context --- .github/ISSUE_TEMPLATE/05-performance.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/ISSUE_TEMPLATE/05-performance.yml b/.github/ISSUE_TEMPLATE/05-performance.yml index 829076a1bd466..da0e6c7ada7a7 100644 --- a/.github/ISSUE_TEMPLATE/05-performance.yml +++ b/.github/ISSUE_TEMPLATE/05-performance.yml @@ -1,6 +1,7 @@ name: Performance description: issues related to performance title: "[Performance] " +labels: ["performance"] body: - type: markdown attributes: From ebcb7075ebd5657069c9b00be4bde0bc814307c6 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Thu, 25 Jul 2024 10:17:16 -0700 Subject: [PATCH 18/57] Set CUDA12 as default in GPU packages (#21438) ### Description * Swap cuda version 11.8/12.2 in GPU CIs * Set CUDA12 as default version in yamls of publishing nuget/python/java GPU packages * Suppress warnings as errors of flash_api.cc during ort win-build --- .../cuda/bert/flash_attention/flash_api.cc | 8 ++++++ .../azure-pipelines/linux-gpu-ci-pipeline.yml | 2 +- .../linux-gpu-tensorrt-ci-pipeline.yml | 2 +- ...linux-gpu-tensorrt-daily-perf-pipeline.yml | 8 +++--- .../nuget-cuda-publishing-pipeline.yml | 15 ++++++----- .../github/azure-pipelines/publish-nuget.yml | 26 +++++++++++++++---- .../py-cuda-publishing-pipeline.yml | 2 +- .../stages/java-cuda-publishing-stage.yml | 2 +- .../jobs/download_win_gpu_library.yml | 2 +- .../templates/jobs/set-winenv.yml | 4 +-- 
.../azure-pipelines/win-gpu-ci-pipeline.yml | 13 +++++++--- .../win-gpu-tensorrt-ci-pipeline.yml | 19 ++++++++++++-- .../docker/Dockerfile.manylinux2_28_cuda | 2 +- .../Dockerfile.package_ubi8_cuda_tensorrt10_0 | 6 ++--- .../github/windows/setup_env_cuda.bat | 14 +++++----- .../ci_build/github/windows/setup_env_gpu.bat | 16 ++++++------ .../ci_build/github/windows/setup_env_trt.bat | 8 +++--- 17 files changed, 97 insertions(+), 52 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc index 90f0b94cafce8..967c04c52b182 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.cc @@ -92,6 +92,11 @@ void set_params_fprop(Flash_fwd_params& params, params.softmax_lse_ptr = softmax_lse_d; // Set the dimensions. +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4267) // Ignore conversion from 'size_t' to 'int', possible loss of data +#pragma warning(disable : 4244) // Ignore conversion from 'double' to 'float', possible loss of data +#endif params.b = batch_size; params.h = num_heads; params.h_k = num_heads_k; @@ -119,6 +124,9 @@ void set_params_fprop(Flash_fwd_params& params, if (window_size_left >= 0 && window_size_right < 0) { window_size_right = seqlen_k; } +#if defined(_MSC_VER) +#pragma warning(pop) +#endif params.window_size_left = window_size_left; params.window_size_right = window_size_right; diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 30f56f4b18aec..d3e4a2e009598 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -30,7 +30,7 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '11.8' + default: '12.2' values: - 11.8 - 12.2 diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 78e3b166995ec..5c7108861052e 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -30,7 +30,7 @@ parameters: - name: CudaVersion displayName: CUDA version type: string - default: '11.8' + default: '12.2' values: - 11.8 - 12.2 diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index 7cfff805c3b3c..4ab1b4996a1db 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -8,14 +8,12 @@ parameters: - name: TrtVersion displayName: TensorRT Version type: string - default: 10.0.cuda_11_8_cudnn_8 + default: 10.2.cuda_12_5_cudnn_9 values: - - 8.4.cuda_11_6_cudnn_8 - - 8.5.cuda_11_8_cudnn_8 - 8.6.cuda_11_8_cudnn_8 - 8.6.cuda_12_3_cudnn_9 - - 10.0.cuda_11_8_cudnn_8 - - 10.0.cuda_12_4_cudnn_9 + - 10.2.cuda_11_8_cudnn_8 + - 10.2.cuda_12_5_cudnn_9 - BIN - name: UseTensorrtOssParser diff --git a/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml b/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml index 4bfd726f5c58c..aeb250e1e0cbc 100644 --- 
a/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml @@ -6,6 +6,7 @@ resources: branches: include: - main + - rel-* branch: main parameters: @@ -16,15 +17,15 @@ parameters: variables: - name: ArtifactFeed ${{ if eq(parameters.isReleaseBuild, false) }}: - value: ort-cuda-12-nightly + value: ORT-Nightly ${{ else }}: value: onnxruntime-cuda-12 stages: -- template: stages/nuget-cuda-publishing-stage.yml - parameters: - artifact_feed: $(ArtifactFeed) + - template: stages/nuget-cuda-publishing-stage.yml + parameters: + artifact_feed: $(ArtifactFeed) -- template: stages/java-cuda-publishing-stage.yml - parameters: - artifact_feed: $(ArtifactFeed) + - template: stages/java-cuda-publishing-stage.yml + parameters: + artifact_feed: $(ArtifactFeed) \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/publish-nuget.yml b/tools/ci_build/github/azure-pipelines/publish-nuget.yml index e0c588413415b..206a9464de6ef 100644 --- a/tools/ci_build/github/azure-pipelines/publish-nuget.yml +++ b/tools/ci_build/github/azure-pipelines/publish-nuget.yml @@ -9,10 +9,22 @@ resources: - rel-* branch: main +parameters: + - name: isReleaseBuild + type: boolean + default: false + +variables: + - name: ArtifactFeed + ${{ if eq(parameters.isReleaseBuild, false) }}: + value: ort-cuda-11-nightly + ${{ else }}: + value: onnxruntime-cuda-11 + stages: - template: templates/publish-nuget-steps.yml parameters: - stage_name: 'Publish_NuGet_Packag_And_Report' + stage_name: 'Publish_NuGet_Package_And_Report' include_cpu_ep: true download_artifacts_steps: - download: build @@ -25,7 +37,11 @@ stages: artifact: 'drop-signed-nuget-Training-CPU' - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-Training-CPU\*" $(Build.BinariesDirectory)\nuget-artifact\final-package - - download: build - displayName: 'Download Pipeline Artifact - Signed NuGet Package' - artifact: 'drop-signed-nuget-GPU' - - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-GPU\*" $(Build.BinariesDirectory)\nuget-artifact\final-package + # Publish CUDA 11 Nuget/Java pkgs to ADO feed + - template: stages/nuget-cuda-publishing-stage.yml + parameters: + artifact_feed: $(ArtifactFeed) + + - template: stages/java-cuda-publishing-stage.yml + parameters: + artifact_feed: $(ArtifactFeed) \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml index 50e0ca3708d2d..1217163c07132 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-publishing-pipeline.yml @@ -16,7 +16,7 @@ parameters: variables: - name: ArtifactFeed ${{ if eq(parameters.isReleaseBuild, false) }}: - value: ort-cuda-12-nightly + value: ORT-Nightly ${{ else }}: value: onnxruntime-cuda-12 diff --git a/tools/ci_build/github/azure-pipelines/stages/java-cuda-publishing-stage.yml b/tools/ci_build/github/azure-pipelines/stages/java-cuda-publishing-stage.yml index 70d92286b3964..946d651b795d4 100644 --- a/tools/ci_build/github/azure-pipelines/stages/java-cuda-publishing-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/java-cuda-publishing-stage.yml @@ -8,7 +8,7 @@ stages: jobs: - job: JAR_Publishing_GPU #TD-DO: figure out a way to package nightly jar. 
Currently Java version are set from VERSION_NUMBER file - condition: ${{ eq(parameters.artifact_feed, 'onnxruntime-cuda-12') }} + condition: ${{ or(eq(parameters.artifact_feed, 'onnxruntime-cuda-11'), eq(parameters.artifact_feed, 'onnxruntime-cuda-12')) }} workspace: clean: all pool: 'onnxruntime-Win-CPU-2022' diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index de29a3de9fded..6459888a40aea 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -7,7 +7,7 @@ parameters: default: false - name: CudaVersion type: string - default: '11.8' + default: '12.2' values: - 11.8 - 12.2 diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index 63d521f1e7d9a..fba463b49016a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -9,10 +9,10 @@ parameters: default: false - name: PrimaryCUDAVersion type: string - default: '11.8' + default: '12.2' - name: SecondaryCUDAVersion type: string - default: '12.2' + default: '11.8' steps: - ${{ if eq(parameters.DownloadCUDA, 'true') }}: diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index 438e51175c5b4..c5262880c4c55 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -28,6 +28,13 @@ pr: #### end trigger #### parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 - name: RunOnnxRuntimeTests displayName: Run Tests? 
type: boolean @@ -43,7 +50,7 @@ stages: EnvSetupScript: setup_env_cuda.bat buildArch: x64 additionalBuildFlags: >- - --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" + --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --enable_cuda_profiling --enable_transformers_tool_test --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON @@ -65,7 +72,7 @@ stages: EnvSetupScript: setup_env_cuda.bat buildArch: x64 additionalBuildFlags: >- - --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" + --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 @@ -105,7 +112,7 @@ stages: # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags additionalBuildFlags: >- --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda - --cuda_home="$(Agent.TempDirectory)\v11.8" + --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF msbuildPlatform: x64 diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml index 70c0c7d4a04e7..8c9ecdfb90191 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml @@ -26,6 +26,21 @@ pr: - 'js/web' - 'onnxruntime/core/providers/js' #### end trigger #### +parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +variables: + - name: win_trt_folder + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5 jobs: - job: 'build' @@ -55,7 +70,7 @@ jobs: WithCache: True Today: $(TODAY) AdditionalKey: "gpu-tensorrt | RelWithDebInfo" - BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86' + BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder }}" --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86' MsbuildArguments: $(MsbuildArguments) BuildArch: 'x64' Platform: 'x64' @@ -75,7 +90,7 @@ jobs: del wheel_filename_file python.exe -m pip install -q --upgrade %WHEEL_FILENAME% set PATH=$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo;%PATH% - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo 
--use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 + python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder }}" --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' displayName: 'Run tests' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda index d96b342974273..07885ba65af8a 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda @@ -2,7 +2,7 @@ # Please overwrite BASEIMAGE, TRT_VERSION and other arguments with # --docker-build-args ' --build-arg BASEIMAGE=other_base_image --build-arg TRT_VERSION=other_trt_version etc...' # for other cuda version and TRT version -ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 +ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8 FROM $BASEIMAGE ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 index 2d3dc05285e3c..b587a7df554bd 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 @@ -2,11 +2,11 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License. # -------------------------------------------------------------- -# Dockerfile to Test ONNX Runtime on UBI8 with TensorRT 10.0 and CUDA 11.8 by default +# Dockerfile to Test ONNX Runtime on UBI8 with TensorRT 10 and CUDA 12 by default # Build base image with required system packages -ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.2.0.19-1.cuda11.8 +ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8 +ARG TRT_VERSION=10.2.0.19-1.cuda12.4 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/windows/setup_env_cuda.bat b/tools/ci_build/github/windows/setup_env_cuda.bat index 2233f7611ab6a..f93938e2a9009 100644 --- a/tools/ci_build/github/windows/setup_env_cuda.bat +++ b/tools/ci_build/github/windows/setup_env_cuda.bat @@ -1,17 +1,17 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( -set PATH=%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64;%PATH% +if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( +set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH% ) else ( - set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64;%PATH% + set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% ) -@REM The default version is still cuda v11.8, because set cuda v12.2 after it -if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( - set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64 +@REM The default version is still cuda v12.2, because set cuda v11.8 after it +if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( + set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 ) else ( - set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64 + set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64 ) set GRADLE_OPTS=-Dorg.gradle.daemon=false diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index 6c59866ea925a..35e4f7e302430 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -1,17 +1,17 @@ REM Copyright (c) Microsoft Corporation. All rights reserved. REM Licensed under the MIT License. 
-if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
-  set PATH=%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64;%PATH%
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
+  set PATH=%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64;%PATH%
 ) else (
-  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64;%PATH%
+  set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH%
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib;%PATH%

-@REM The default version is still cuda v11.8, because set cuda v12.2 after it
-set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib
-if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
-  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
+@REM The default version is still cuda v12.2, because set cuda v11.8 after it
+set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib
+if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
+  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
 ) else (
   set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\\extras\CUPTI\lib64
 )
diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat
index 249bb98815897..7ec7558edab39 100644
--- a/tools/ci_build/github/windows/setup_env_trt.bat
+++ b/tools/ci_build/github/windows/setup_env_trt.bat
@@ -1,11 +1,11 @@
 REM Copyright (c) Microsoft Corporation. All rights reserved.
 REM Licensed under the MIT License.
-if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ (
-  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64
+if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ (
+  set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64
 ) else (
-  set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64
+  set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64
 )
-set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH%
+set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib;%PATH%
 set GRADLE_OPTS=-Dorg.gradle.daemon=false
 set CUDA_MODULE_LOADING=LAZY
\ No newline at end of file

From 4167b68abf2715c52439874ad9ddeaebfb3dafcb Mon Sep 17 00:00:00 2001
From: Changming Sun
Date: Thu, 25 Jul 2024 10:58:34 -0700
Subject: [PATCH 19/57] Split ondevice training cpu packaging pipeline to a
 separated pipeline (#21485)

### Description
Right now our "Zip-Nuget-Java-Nodejs Packaging Pipeline" is too big. This OnDevice training part is independent of the others, so it can be split out. Then our NPM Packaging pipeline will not depend on this training stuff.
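Splitting the pipelines also makes the artifact dependencies between stages easier to audit: a stage that downloads another stage's artifact generally has to name that stage in `dependsOn`, which is exactly the kind of declaration discussed under Motivation below. A minimal Azure Pipelines sketch of the pattern (the stage layout here is illustrative only, not the real pipeline definition):

```
stages:
- stage: Linux_C_API_Packaging_CPU_x64
  jobs:
  - job: build
    steps:
    # publish the shared libraries so downstream stages can consume them
    - publish: $(Build.ArtifactStagingDirectory)
      artifact: onnxruntime-linux-x64

- stage: NuGet_Test_Linux_Training_CPU
  # without this dependsOn, the download step below has no guarantee the artifact exists yet
  dependsOn: Linux_C_API_Packaging_CPU_x64
  jobs:
  - job: test
    steps:
    - download: current
      artifact: onnxruntime-linux-x64
```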
### Motivation and Context
Similar to #21235.
Also, this PR fixed a problem: the "NuGet_Test_Linux_Training_CPU" job downloads artifacts from "onnxruntime-linux-x64" to get the custom-op shared libs, but the job forgot to declare that it depends on "Linux_C_API_Packaging_CPU_x64", which produces that artifact. Such problems can be hard to find when a pipeline gets big.
---
 .../c-api-noopenmp-packaging-pipelines.yml    | 11 ----
 .../c-api-training-packaging-pipelines.yml    | 51 +++++++++++++++++++
 ...device-training-cpu-packaging-pipeline.yml |  1 +
 3 files changed, 52 insertions(+), 11 deletions(-)
 create mode 100644 tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml

diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
index 2eb7046d80e7a..51b73acd93dc8 100644
--- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
+++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
@@ -112,17 +112,6 @@ stages:
     SpecificArtifact: ${{ parameters.SpecificArtifact }}
     BuildId: ${{ parameters.BuildId }}

-- template: templates/ondevice-training-cpu-packaging-pipeline.yml
-  parameters:
-    RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
-    DoCompliance: ${{ parameters.DoCompliance }}
-    DoEsrp: ${{ parameters.DoEsrp }}
-    IsReleaseBuild: ${{ parameters.IsReleaseBuild }}
-    OrtNugetPackageId: 'Microsoft.ML.OnnxRuntime.Training'
-    AdditionalBuildFlags: '--enable_training_apis'
-    AdditionalWinBuildFlags: '--enable_onnx_tests --enable_wcos'
-    BuildVariant: 'default'
-
 - template: stages/java-cuda-packaging-stage.yml
   parameters:
     CudaVersion: 11.8
diff --git a/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml
new file mode 100644
index 0000000000000..aecece05a0e58
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml
@@ -0,0 +1,51 @@
+parameters:
+- name: RunOnnxRuntimeTests
+  displayName: Run Tests?
+  type: boolean
+  default: true
+
+- name: DoCompliance
+  displayName: Run Compliance Tasks?
+  type: boolean
+  default: true
+
+- name: DoEsrp
+  displayName: Run code sign tasks? Must be true if you are doing an ONNX Runtime release
+  type: boolean
+  default: true
+
+- name: IsReleaseBuild
+  displayName: Is a release build? Set it to true if you are doing an ONNX Runtime release.
+  type: boolean
+  default: false
+- name: PreReleaseVersionSuffixString
+  displayName: Suffix added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the type of pre-release package.
+  type: string
+  values:
+  - alpha
+  - beta
+  - rc
+  - none
+  default: none
+
+- name: PreReleaseVersionSuffixNumber
+  displayName: Number added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the sequence of a pre-release package.
+ type: number + default: 0 + +stages: +- template: stages/set_packaging_variables_stage.yml + parameters: + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }} + PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }} +- template: templates/ondevice-training-cpu-packaging-pipeline.yml + parameters: + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + DoCompliance: ${{ parameters.DoCompliance }} + DoEsrp: ${{ parameters.DoEsrp }} + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + OrtNugetPackageId: 'Microsoft.ML.OnnxRuntime.Training' + AdditionalBuildFlags: '--enable_training_apis' + AdditionalWinBuildFlags: '--enable_onnx_tests --enable_wcos' + BuildVariant: 'default' \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml index fb9ff65fe8534..022f85cc0a463 100644 --- a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml @@ -317,3 +317,4 @@ stages: ArtifactSuffix: 'Training-CPU' StageSuffix: 'Training_CPU' NativePackagePrefix: 'onnxruntime-training' + CustomOpArtifactName: 'onnxruntime-training-linux-x64' From c23517859eb67a0a03f9777e9c741b9ebaabd6eb Mon Sep 17 00:00:00 2001 From: Hector Li Date: Thu, 25 Jul 2024 11:44:10 -0700 Subject: [PATCH 20/57] Qnn batchnorm support input with rank 2 (#21469) ### Description Qnn BatchNorm support input with rank 2 Update Quantization script to quantize BatchNorm bias using int32 --------- Co-authored-by: Justin Chu --- .../selectors_actions/qdq_selectors.cc | 2 +- .../opbuilder/batch_norm_op_builder.cc | 21 +++- .../builder/opbuilder/expand_op_builder.cc | 2 +- .../core/providers/qnn/builder/qnn_utils.cc | 23 ++++- .../core/providers/qnn/builder/qnn_utils.h | 5 +- .../tools/quantization/operators/norm.py | 2 +- .../python/tools/quantization/registry.py | 1 + .../test/providers/qnn/batch_norm_htp_test.cc | 99 ++++++++++++------- .../test/providers/qnn/qnn_test_utils.h | 6 +- 9 files changed, 111 insertions(+), 50 deletions(-) diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index 6e93445c7c5c7..e271ae8df3356 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -632,7 +632,7 @@ bool BatchNormalizationNodeGroupSelector::Check(const GraphViewer& graph_viewer, const Node& node, const std::vector& dq_nodes, const std::vector& q_nodes) const { - if (!CheckQDQNodes(graph_viewer, node, dq_nodes, q_nodes)) { + if (!CheckQDQNodes(graph_viewer, node, dq_nodes, q_nodes, 3)) { return false; } diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc index 16a058854a743..07abcf1c7bf84 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/batch_norm_op_builder.cc @@ -392,15 +392,23 @@ class BatchNormOpBuilder : public BaseOpBuilder { const double rmin, QnnQuantParamsWrapper& quant_param, std::vector& raw_tensor) 
const { + bool symmetric = false; if (info.quant_param.IsQuantized()) { - raw_tensor.resize(double_tensor.size()); + size_t data_size = double_tensor.size(); + // QNN BatchNorm int32 bias requires symmetric quantizated + if (info.qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { + data_size *= sizeof(int32_t); + symmetric = true; + } + raw_tensor.resize(data_size); float scale = 0.0f; - int zero_point = 0; + int32_t zero_point = 0; ORT_RETURN_IF_ERROR(utils::GetQuantParams(static_cast(rmin), static_cast(rmax), info.qnn_data_type, scale, - zero_point)); + zero_point, + symmetric)); quant_param = QnnQuantParamsWrapper(scale, zero_point); for (size_t i = 0; i < double_tensor.size(); ++i) { // onnx only supports 8 bits quantization @@ -411,6 +419,10 @@ class BatchNormOpBuilder : public BaseOpBuilder { } else if (info.qnn_data_type == QNN_DATATYPE_SFIXED_POINT_8) { int8_t quant_value = static_cast(quant_value_int); raw_tensor[i] = *reinterpret_cast(&quant_value); + } else if (info.qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { + int32_t quant_value = static_cast(quant_value_int); + size_t pos = i * sizeof(int32_t); + std::memcpy(&raw_tensor[pos], reinterpret_cast(&quant_value), sizeof(int32_t)); } else { // TODO(adrianlizarraga): Should support 16-bit quantization as well. ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", info.qnn_data_type); @@ -444,8 +456,7 @@ Status BatchNormOpBuilder::IsOpSupported(QnnModelWrapper& qnn_model_wrapper, ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape of input 0."); const size_t input_rank = input_shape.size(); - ORT_RETURN_IF(input_rank <= 2 || input_rank > 4, - "QNN BatchNorm only supports input ranks of size 3 or 4."); + ORT_RETURN_IF(input_rank > 4, "QNN BatchNorm only supports input ranks of size <= 4."); const uint32_t num_channels = input_shape[1]; diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc index d0f6ce9effd9e..64f676aaa9875 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/expand_op_builder.cc @@ -79,7 +79,7 @@ Status ExpandOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, if (is_quantized_tensor) { ORT_RETURN_IF_ERROR(utils::GetQnnDataType(true, type_proto, qnn_data_type)); float scale = 0.0f; - int zero_point = 0; + int32_t zero_point = 0; float rmax = 1.0f; float rmin = 1.0f; ORT_RETURN_IF_ERROR(utils::GetQuantParams(rmin, diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc index c2e500b8980ad..d6c93a8f226e8 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.cc @@ -509,6 +509,9 @@ Status GetQminQmax(const Qnn_DataType_t qnn_data_type, } else if (qnn_data_type == QNN_DATATYPE_UFIXED_POINT_16) { qmin = static_cast(std::numeric_limits::min()); qmax = static_cast(std::numeric_limits::max()); + } else if (qnn_data_type == QNN_DATATYPE_SFIXED_POINT_32) { + qmin = static_cast(std::numeric_limits::min()); + qmax = static_cast(std::numeric_limits::max()); } else { ORT_RETURN_IF(true, "Qnn Data Type: %d not supported yet.", qnn_data_type); } @@ -519,15 +522,27 @@ Status GetQuantParams(float rmin, float rmax, const Qnn_DataType_t qnn_data_type, float& scale, - int& zero_point) { + int32_t& zero_point, + bool symmetric) { std::tie(rmin, rmax) = 
CheckMinMax(rmin, rmax); + if (symmetric) { + float abs_max = std::max(abs(rmax), abs(rmin)); + rmax = abs_max; + rmin = -abs_max; + } + float qmin = 0.0f; float qmax = 255.0f; ORT_RETURN_IF_ERROR(GetQminQmax(qnn_data_type, qmin, qmax)); scale = (rmax - rmin) / (qmax - qmin); - const float initial_zero_point = qmin - (rmin / scale); - zero_point = static_cast(RoundHalfToEven(Saturate(qmax, qmin, initial_zero_point))); + float initial_zero_point = 0.0f; + if (symmetric) { + initial_zero_point = std::round(rmin + rmax) / 2; + } else { + initial_zero_point = qmin - (rmin / scale); + } + zero_point = static_cast(RoundHalfToEven(Saturate(qmax, qmin, initial_zero_point))); // To match QNN quantization definition zero_point = 0 - zero_point; return Status::OK(); @@ -541,7 +556,7 @@ double Dequantize(int32_t offset, float scale, const double quant_value) { Status Quantize(const double double_value, const float scale, - const int zero_point, + const int32_t zero_point, const Qnn_DataType_t qnn_data_type, int& quant_value) { int qmin = 0; diff --git a/onnxruntime/core/providers/qnn/builder/qnn_utils.h b/onnxruntime/core/providers/qnn/builder/qnn_utils.h index 2392040d284b7..aa4a27460563f 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_utils.h +++ b/onnxruntime/core/providers/qnn/builder/qnn_utils.h @@ -93,13 +93,14 @@ Status GetQuantParams(float rmin, float rmax, const Qnn_DataType_t qnn_data_type, float& scale, - int& zero_point); + int32_t& zero_point, + bool symmetric = false); double Dequantize(int32_t offset, float scale, const double quant_value); Status Quantize(const double double_value, const float scale, - const int zero_point, + const int32_t zero_point, const Qnn_DataType_t qnn_data_type, int& quant_value); diff --git a/onnxruntime/python/tools/quantization/operators/norm.py b/onnxruntime/python/tools/quantization/operators/norm.py index 8c4c6c78582ac..10d96cc49855e 100644 --- a/onnxruntime/python/tools/quantization/operators/norm.py +++ b/onnxruntime/python/tools/quantization/operators/norm.py @@ -12,7 +12,7 @@ def __init__(self, onnx_quantizer, onnx_node): def quantize(self): node = self.node - assert node.op_type == "InstanceNormalization" or node.op_type == "LayerNormalization" + assert node.op_type in {"InstanceNormalization", "LayerNormalization", "BatchNormalization"} # Input self.quantizer.quantize_activation_tensor(node.input[0]) diff --git a/onnxruntime/python/tools/quantization/registry.py b/onnxruntime/python/tools/quantization/registry.py index b00e830a2a366..caac829126e38 100644 --- a/onnxruntime/python/tools/quantization/registry.py +++ b/onnxruntime/python/tools/quantization/registry.py @@ -82,6 +82,7 @@ "Where": QDQWhere, "InstanceNormalization": QDQNormalization, "LayerNormalization": QDQNormalization, + "BatchNormalization": QDQNormalization, } diff --git a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc index 036c5760ed560..0a39413a4ec1b 100644 --- a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc +++ b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc @@ -80,8 +80,7 @@ template static GetTestModelFn BuildBatchNormTestCase(const TestInputDef& input_def, const TestInputDef& scale_def, const TestInputDef& bias_def) { - ORT_ENFORCE(input_def.IsRawData()); // Need raw data to compute mean and variance inputs. - ORT_ENFORCE(input_def.GetShape().size() > 2); // Need at least rank 3 data for convenience. + ORT_ENFORCE(input_def.IsRawData()); // Need raw data to compute mean and variance inputs. 
return [input_def, scale_def, bias_def](ModelTestBuilder& builder) { const auto& input_shape = input_def.GetShape(); @@ -103,45 +102,39 @@ static GetTestModelFn BuildBatchNormTestCase(const TestInputDef& inp }; } -template +template GetTestQDQModelFn BuildQDQBatchNormTestCase(const TestInputDef& input_def, const TestInputDef& scale_def, const TestInputDef& bias_def) { - ORT_ENFORCE(input_def.IsRawData()); // Need raw data to compute mean and variance inputs. - ORT_ENFORCE(input_def.GetShape().size() > 2); // Need at least rank 3 data for convenience. + ORT_ENFORCE(input_def.IsRawData()); // Need raw data to compute mean and variance inputs. return [input_def, scale_def, bias_def](ModelTestBuilder& builder, std::vector>& output_qparams) { const auto& input_shape = input_def.GetShape(); const auto& input_data = input_def.GetRawData(); const int64_t num_channels = input_shape[1]; - + bool symmetric = sizeof(InputQType) == sizeof(uint16_t); NodeArg* input = MakeTestInput(builder, input_def); - QuantParams input_qparams = GetTestInputQuantParams(input_def); + QuantParams input_qparams = GetTestInputQuantParams(input_def, symmetric); NodeArg* input_qdq = AddQDQNodePair(builder, input, input_qparams.scale, input_qparams.zero_point); NodeArg* scale = MakeTestInput(builder, scale_def); QuantParams scale_qparams = GetTestInputQuantParams(scale_def); NodeArg* scale_qdq = AddQDQNodePair(builder, scale, scale_qparams.scale, scale_qparams.zero_point); - NodeArg* bias = MakeTestInput(builder, bias_def); - QuantParams bias_qparams = GetTestInputQuantParams(bias_def); - NodeArg* bias_qdq = AddQDQNodePair(builder, bias, bias_qparams.scale, bias_qparams.zero_point); + NodeArg* bias_qdq; + // bias (as int32) => DQ => + bias_qdq = MakeTestQDQBiasInput(builder, bias_def, input_qparams.scale * scale_qparams.scale, true); std::vector mean_vals(num_channels); std::vector var_vals(num_channels); ComputeChannelMeanAndVar(input_data, input_shape, mean_vals, var_vals); NodeArg* mean = builder.MakeInitializer({num_channels}, mean_vals); - QuantParams mean_qparams = GetDataQuantParams(mean_vals); - NodeArg* mean_qdq = AddQDQNodePair(builder, mean, mean_qparams.scale, mean_qparams.zero_point); - NodeArg* var = builder.MakeInitializer({num_channels}, var_vals); - QuantParams var_qparams = GetDataQuantParams(var_vals); - NodeArg* var_qdq = AddQDQNodePair(builder, var, var_qparams.scale, var_qparams.zero_point); auto* batchnorm_output = builder.MakeIntermediate(); - builder.AddNode("BatchNormalization", {input_qdq, scale_qdq, bias_qdq, mean_qdq, var_qdq}, + builder.AddNode("BatchNormalization", {input_qdq, scale_qdq, bias_qdq, mean, var}, {batchnorm_output}); AddQDQNodePairWithOutputAsGraphOutput(builder, batchnorm_output, output_qparams[0].scale, output_qparams[0].zero_point); @@ -155,6 +148,7 @@ GetTestQDQModelFn BuildQDQBatchNormTestCase(const TestInputDef static void RunBatchNormQDQTest(const TestInputDef& input_def, const TestInputDef& scale_def, const TestInputDef& bias_def, @@ -169,9 +163,9 @@ static void RunBatchNormQDQTest(const TestInputDef& input_def, // Runs model with DQ-> InstanceNorm -> Q and compares the outputs of the CPU and QNN EPs. 
TestQDQModelAccuracy(BuildBatchNormTestCase(input_def, scale_def, bias_def), - BuildQDQBatchNormTestCase(input_def, scale_def, bias_def), + BuildQDQBatchNormTestCase(input_def, scale_def, bias_def), provider_options, - 11, + 21, expected_ep_assignment, tolerance); } @@ -199,31 +193,69 @@ static void RunBatchNormFP16Test(const TestInputDef& input_def, expected_ep_assignment); } +// BatchNor QDQ model, input with rank 2. +TEST_F(QnnHTPBackendTests, BatchNormRank2) { + constexpr int64_t num_channels = 2; + + RunBatchNormQDQTest(TestInputDef({4, num_channels}, false, + {-8.0f, -6.0f, -4.0f, -2.0f, 0.0f, 1.1f, 3.3f, 8.0f}), // Input data + TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer + TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer + ExpectedEPNodeAssignment::All); +} + // TODO: FIX TRANSLATION!!! // Check that QNN compiles DQ -> BatchNormalization -> Q as a single unit. // Use an input of rank 3. +// Accuracy issue with Linux simulator, not sure with Android device +// Inaccuracy detected for output 'output_0', element 1 +// output_range=4.8666362762451172, tolerance=0.40000000596046448%. +// Expected val (f32@CPU_EP): 1.0999999046325684 +// qdq@QNN_EP val: -0.17176364362239838 (err: 1.2717635631561279, err/output_range: 26.132291793823242%) +// qdq@CPU_EP val: 1.1069211959838867 (err: 0.0069212913513183594, err/output_range: 0.14221921563148499%) +// abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 25.990072250366211% +// +// Inaccuracy detected for output 'output_0', element 2 +// output_range=4.8666362762451172, tolerance=0.40000000596046448%. +// Expected val (f32@CPU_EP): 2.3247356414794922 +// qdq@QNN_EP val: -0.17176364362239838 (err: 2.4964993000030518, err/output_range: 51.298248291015625%) +// qdq@CPU_EP val: 2.3474364280700684 (err: 0.022700786590576172, err/output_range: 0.46645742654800415%) +#if defined(_WIN32) TEST_F(QnnHTPBackendTests, BatchNorm1D) { constexpr int64_t num_channels = 2; - RunBatchNormQDQTest(TestInputDef({1, num_channels, 3}, false, {-5.0f, -4.0f, -3.0f, 0.0f, 2.0f, 5.0f}), // Input data - TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer - TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer - ExpectedEPNodeAssignment::All); + RunBatchNormQDQTest(TestInputDef({1, num_channels, 3}, false, + {-5.0f, -4.0f, -3.0f, 0.0f, 2.0f, 5.0f}), // Input data + TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer + TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer + ExpectedEPNodeAssignment::All); +} +#endif + +// Check that QNN compiles DQ -> BatchNormalization -> Q as a single unit. +// Use an input of rank 4. +TEST_F(QnnHTPBackendTests, BatchNorm2D_a8w8) { + constexpr int64_t num_channels = 2; + std::vector input_data = {-8.0f, -6.0f, -4.0f, -2.0f, 0.0f, 1.1f, 3.3f, 8.0f, + -7.0f, -5.0f, -3.0f, -1.0f, 0.0f, 2.1f, 4.3f, 7.0f}; + + RunBatchNormQDQTest(TestInputDef({2, num_channels, 2, 2}, false, input_data), // Input data + TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer + TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer + ExpectedEPNodeAssignment::All); } // Check that QNN compiles DQ -> BatchNormalization -> Q as a single unit. // Use an input of rank 4. 
-TEST_F(QnnHTPBackendTests, BatchNorm2D) { +TEST_F(QnnHTPBackendTests, BatchNorm2D_a16w8) { constexpr int64_t num_channels = 2; std::vector input_data = {-8.0f, -6.0f, -4.0f, -2.0f, 0.0f, 1.1f, 3.3f, 8.0f, -7.0f, -5.0f, -3.0f, -1.0f, 0.0f, 2.1f, 4.3f, 7.0f}; - RunBatchNormQDQTest(TestInputDef({2, num_channels, 2, 2}, false, input_data), // Input data - TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer - TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer - ExpectedEPNodeAssignment::All, - // Require a slightly increased tolerance on Windows ARM64 (from 0.4% to 0.6%). - QDQTolerance(0.006f)); + RunBatchNormQDQTest(TestInputDef({2, num_channels, 2, 2}, false, input_data), // Input data + TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer + TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer + ExpectedEPNodeAssignment::All); } // Test FP16 BatchNormalization on the HTP backend. @@ -272,10 +304,11 @@ TEST_F(QnnHTPBackendTests, BatchNorm_FP32_as_FP16) { TEST_F(QnnHTPBackendTests, BatchNorm3D) { constexpr int64_t num_channels = 2; constexpr int64_t num_elems = 1 * num_channels * 3 * 4 * 5; - RunBatchNormQDQTest(TestInputDef({1, num_channels, 3, 4, 5}, false, std::vector(num_elems)), // Input data (all zeros) - TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer - TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer - ExpectedEPNodeAssignment::None); + RunBatchNormQDQTest(TestInputDef({1, num_channels, 3, 4, 5}, false, + std::vector(num_elems)), // Input data (all zeros) + TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer + TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer + ExpectedEPNodeAssignment::None); } #endif // defined(__aarch64__) || defined(_M_ARM64) || defined(__linux__) diff --git a/onnxruntime/test/providers/qnn/qnn_test_utils.h b/onnxruntime/test/providers/qnn/qnn_test_utils.h index eb03270dc8461..3a6753e9b6131 100644 --- a/onnxruntime/test/providers/qnn/qnn_test_utils.h +++ b/onnxruntime/test/providers/qnn/qnn_test_utils.h @@ -42,7 +42,7 @@ struct QuantParams { symmetric); } - static QuantParams Compute(float rmin, float rmax, QType qmin, QType qmax, bool symmetric = false) { + static QuantParams Compute(float rmin, float rmax, float qmin, float qmax, bool symmetric = false) { // Ensure a minimum range of 0.0001 (required by QNN) rmax = std::max(rmax, rmin + 0.0001f); @@ -56,8 +56,8 @@ struct QuantParams { rmin = -abs_max; } - float qmin_flt = static_cast(qmin); - float qmax_flt = static_cast(qmax); + float qmin_flt = qmin; + float qmax_flt = qmax; const float scale = (rmax - rmin) / (qmax_flt - qmin_flt); float initial_zero_point = 0.0f; From 3cdf4b917b4c679f3f4152145f36c7705b12d2c3 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 26 Jul 2024 07:36:23 +1000 Subject: [PATCH 21/57] Fix Android CI Pipeline code coverage failure (#21504) ### Description Current failure is due to a version mismatch. Use llvm-cov from the Android NDK instead of the system gcov so that the version is correct. Also comment out publishing to the Azure dashboard to simplify the setup. The CI prints out the stats for review by developers. 
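In practice the change boils down to exporting GCOV so the coverage tooling runs the NDK's `llvm-cov` in gcov mode instead of the host `gcov` before the report is generated; the updated pipeline step looks like this (same commands as in the diff below, path follows the standard linux-x86_64 NDK prebuilt layout):

```
  - script: |
      export GCOV="$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-cov gcov"
      python3 -m pip install gcovr
      python3 tools/ci_build/coverage.py --build_dir build_nnapi --android_sdk_path $ANDROID_HOME
    displayName: Retrieve runtime code coverage files from the emulator and analyze
```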
### Motivation and Context Fix CI pipeline --- ...ndroid-x86_64-crosscompile-ci-pipeline.yml | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml index 10d9a9a24d88a..bcfe4cde9ce50 100644 --- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml @@ -174,10 +174,10 @@ stages: - template: templates/clean-agent-build-directory-step.yml -- stage: MASTER_BUILD_STAGE - # The below jobs only run on master build. +- stage: MAIN_BUILD_STAGE + # The below jobs only run on build of main branch. # because coverage report is hard to support in cross machines. - displayName: NNAPI MASTER BUILD&TEST + displayName: NNAPI MAIN BUILD&TEST dependsOn: [] condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') jobs: @@ -225,29 +225,29 @@ stages: --code_coverage displayName: NNAPI EP, Build, Test, CodeCoverage on Android Emulator + # We need to use llvm-cov from the NDK. - script: | - python3 -m pip install gcovr && \ - python3 tools/ci_build/coverage.py \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME + export GCOV="$ANDROID_NDK_HOME/toolchains/llvm/prebuilt/linux-x86_64/bin/llvm-cov gcov" + python3 -m pip install gcovr + python3 tools/ci_build/coverage.py --build_dir build_nnapi --android_sdk_path $ANDROID_HOME displayName: Retrieve runtime code coverage files from the emulator and analyze - script: cat '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' displayName: Print coverage report + # - task: AzureCLI@2 + # displayName: 'Post Android Code Coverage To DashBoard' + # inputs: + # azureSubscription: AIInfraBuild + # scriptType: bash + # scriptPath: $(Build.SourcesDirectory)/tools/ci_build/github/linux/upload_code_coverage_data.sh + # arguments: '"$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi' + # workingDirectory: '$(Build.BinariesDirectory)' + - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator displayName: Build Minimal ORT with NNAPI and run tests - - task: AzureCLI@2 - displayName: 'Post Android Code Coverage To DashBoard' - inputs: - azureSubscription: AIInfraBuild - scriptType: bash - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/github/linux/upload_code_coverage_data.sh - arguments: '"$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi' - workingDirectory: '$(Build.BinariesDirectory)' - - template: templates/use-android-emulator.yml parameters: stop: true From b0e1f7f7988952166ec867600d9eb92fde0be157 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 26 Jul 2024 08:29:33 +1000 Subject: [PATCH 22/57] CoreML: Aggregated changes to add all required ops for priority model (#21472) ### Description Add these changes to one PR to simplify checkin - Add Concat (#21423) - Add DepthToSpace (#21426) - Add LeakyRelu (#21453) - Add test scripts (#21427) - Add ability to set coreml flags from python (#21434) Other changes - updated partitioning utils to support dropping 
constant initializers from a ComputeCapability's inputs. - noticed that the list of inputs to the coreml model was unexpectedly long due to this - we copy constant initializers to a CoreML model so don't need the originals, and if they remain as inputs ORT can't free them as they appear to be in use. ### Motivation and Context --- .lintrunner.toml | 1 + include/onnxruntime/core/graph/graph.h | 24 ++-- onnxruntime/core/graph/graph.cc | 60 +++++---- .../builders/impl/activation_op_builder.cc | 13 +- .../coreml/builders/impl/builder_utils.cc | 24 +++- .../coreml/builders/impl/builder_utils.h | 20 +++ .../coreml/builders/impl/concat_op_builder.cc | 85 ++++++++---- .../builders/impl/depthtospace_op_builder.cc | 124 +++++++++++++++--- .../builders/impl/gridsample_op_builder.cc | 4 +- .../coreml/builders/op_builder_factory.cc | 23 ++-- .../coreml/coreml_execution_provider.cc | 4 +- .../DebugMLProgram.md | 2 + .../mlprogram_test_scripts/concat_test.py | 33 +++++ .../convtranspose_test.py | 42 ++++++ .../depthtospace_test.py | 51 +++++++ .../coreml/mlprogram_test_scripts/div_test.py | 103 +++++++++++++++ .../dump_mlprogram_model.py | 0 .../mlprogram_test_scripts/gridsample_test.py | 114 ++++++++++++++++ .../mlprogram_test_scripts/resize_test.py | 51 +++++++ .../core/providers/partitioning_utils.cc | 39 +++--- .../core/providers/partitioning_utils.h | 25 ++-- .../providers/qnn/qnn_execution_provider.cc | 5 +- onnxruntime/core/session/inference_session.cc | 5 + .../python/onnxruntime_pybind_state.cc | 29 +++- .../test/optimizer/qdq_transformer_test.cc | 3 +- .../cpu/tensor/space_depth_ops_test.cc | 31 +++++ .../apple/coreml_supported_mlprogram_ops.md | 5 +- 27 files changed, 783 insertions(+), 137 deletions(-) rename onnxruntime/core/providers/coreml/{ => mlprogram_test_scripts}/DebugMLProgram.md (97%) create mode 100644 onnxruntime/core/providers/coreml/mlprogram_test_scripts/concat_test.py create mode 100644 onnxruntime/core/providers/coreml/mlprogram_test_scripts/convtranspose_test.py create mode 100644 onnxruntime/core/providers/coreml/mlprogram_test_scripts/depthtospace_test.py create mode 100644 onnxruntime/core/providers/coreml/mlprogram_test_scripts/div_test.py rename onnxruntime/core/providers/coreml/{ => mlprogram_test_scripts}/dump_mlprogram_model.py (100%) create mode 100644 onnxruntime/core/providers/coreml/mlprogram_test_scripts/gridsample_test.py create mode 100644 onnxruntime/core/providers/coreml/mlprogram_test_scripts/resize_test.py diff --git a/.lintrunner.toml b/.lintrunner.toml index e6d06b34726fe..e1b24b2955b03 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -137,6 +137,7 @@ exclude_patterns = [ 'onnxruntime/core/mickey/gemm/**', # CUTLASS based libs recommends NO automatic code formatting 'winml/lib/Api.Image/shaders/**', # Contains data chunks 'onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h', # Bool Switches hang Clang + 'onnxruntime/core/providers/coreml/mlprogram_test_scripts/**', # test scripts only ] command = [ 'python', diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index 9289e14c17dd1..c51f38553c3b4 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -1408,6 +1408,11 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi RuntimeOptimizationRecordContainer& MutableRuntimeOptimizations() { return runtime_optimizations_; } + + // We don't run Graph::Resolve() on an ORT format model, but a compiling EP may copy 
initializers to its + // compiled model during partitioning, leaving them unused in the ORT Graph. To allow the memory to be freed + // we need to manually run the cleanup that would usually happen as part of Graph::Resolve. + Status RemovedUnusedInitializersOrtFormat(); #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // This friendship relationship should only be used to call Graph::Graph and @@ -1541,12 +1546,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi common::Status PerformTypeAndShapeInferencing(const ResolveOptions& options); - // Recursively find all subgraphs including nested subgraphs - void FindAllSubgraphs(std::vector& subgraphs); - - // Iterate this Graph instance and all subgraphs, calling the provided function for each. - common::Status ForThisAndAllSubgraphs(const std::vector& subgraphs, std::function func); - common::Status InferAndVerifyTypeMatch(Node& node, const ONNX_NAMESPACE::OpSchema& op, const ResolveOptions& options); // perform type and shape inferencing on the subgraph and Resolve to validate @@ -1576,9 +1575,6 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi // Implementation for initializer replacement Status ReplaceInitializedTensorImpl(ONNX_NAMESPACE::TensorProto new_initializer, bool is_external); - // Clear all unused initializers and NodeArgs - void CleanUnusedInitializersAndNodeArgs(const std::unordered_set* initializer_names_to_preserve = nullptr); - std::vector CreateNodeArgs(const google::protobuf::RepeatedPtrField& names, const ArgNameToTypeMap& name_to_type_map); @@ -1587,6 +1583,16 @@ class Graph { // NOLINT(clang-analyzer-optin.performance.Padding): preserve exi #endif // !defined(ORT_MINIMAL_BUILD) #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + + // Recursively find all subgraphs including nested subgraphs + void FindAllSubgraphs(std::vector& subgraphs); + + // Iterate this Graph instance and all subgraphs, calling the provided function for each. 
+ common::Status ForThisAndAllSubgraphs(const std::vector& subgraphs, std::function func); + + // Clear all unused initializers and NodeArgs + void CleanUnusedInitializersAndNodeArgs(const std::unordered_set* initializer_names_to_preserve = nullptr); + Status PopulateNodeArgToProducerConsumerLookupsFromNodes(); template diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index 442a0db933d65..e950d68947b91 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -3254,27 +3254,6 @@ Status Graph::PerformTypeAndShapeInferencing(const ResolveOptions& options) { return Status::OK(); } -void Graph::FindAllSubgraphs(std::vector& subgraphs) { - for (auto& node : Nodes()) { - for (auto& subgraph : node.MutableSubgraphs()) { - subgraphs.push_back(subgraph.get()); - subgraph->FindAllSubgraphs(subgraphs); - } - } -} - -Status Graph::ForThisAndAllSubgraphs(const std::vector& subgraphs, std::function func) { - auto status = func(*this); - ORT_RETURN_IF_ERROR(status); - - for (auto& subgraph : subgraphs) { - status = func(*subgraph); - ORT_RETURN_IF_ERROR(status); - } - - return status; -} - Status Graph::Resolve(const ResolveOptions& options) { if (parent_graph_) { // Resolve must start at the top level graph in-order to handle outer scope @@ -3387,6 +3366,39 @@ void Graph::AddInitializedTensor(const TensorProto& tensor) { ORT_IGNORE_RETURN_VALUE(GetOrCreateNodeArg(tensor.name(), &t)); } } + +void Graph::FindAllSubgraphs(std::vector& subgraphs) { + for (auto& node : Nodes()) { + for (auto& subgraph : node.MutableSubgraphs()) { + subgraphs.push_back(subgraph.get()); + subgraph->FindAllSubgraphs(subgraphs); + } + } +} + +Status Graph::ForThisAndAllSubgraphs(const std::vector& subgraphs, std::function func) { + auto status = func(*this); + ORT_RETURN_IF_ERROR(status); + + for (auto& subgraph : subgraphs) { + status = func(*subgraph); + ORT_RETURN_IF_ERROR(status); + } + + return status; +} + +Status Graph::RemovedUnusedInitializersOrtFormat() { + std::vector all_subgraphs; + FindAllSubgraphs(all_subgraphs); + auto cleanup_func = [](Graph& graph) { + graph.CleanUnusedInitializersAndNodeArgs(nullptr); + return Status::OK(); + }; + + auto result = ForThisAndAllSubgraphs(all_subgraphs, cleanup_func); + return result; +} #endif // !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) const std::string& Graph::Name() const noexcept { @@ -4122,6 +4134,9 @@ void Graph::ToGraphProtoInternal(ONNX_NAMESPACE::GraphProto& graph_proto) const } } +#endif // !defined(ORT_MINIMAL_BUILD) + +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) void Graph::CleanUnusedInitializersAndNodeArgs(const std::unordered_set* initializer_names_to_preserve) { // Node Args being used std::unordered_set used_args; @@ -4253,8 +4268,7 @@ void Graph::CleanUnusedInitializersAndNodeArgs(const std::unordered_set op = model_builder.CreateOperation(node, coreml_op_type); AddOperationInput(*op, "x", node.InputDefs()[0]->Name()); + + if (add_alpha) { + NodeAttrHelper helper(node); + const auto alpha = helper.Get("alpha", 0.01f); + AddOperationInput(*op, "alpha", model_builder.AddScalarConstant(op->type(), "alpha", alpha)); + } + AddOperationOutput(*op, *node.OutputDefs()[0]); model_builder.AddOperation(std::move(op)); @@ -198,7 +209,7 @@ bool ActivationOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInp #if defined(COREML_ENABLE_MLPROGRAM) if (input_params.create_mlprogram) { - if (op_type == "PRelu" || op_type == "LeakyRelu") { + if (op_type == 
"PRelu") { // TODO: ML Program supports this so should be easy to enable return false; } } else diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc index ebb3f97895f06..e02186d3aee89 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.cc @@ -309,11 +309,33 @@ COREML_SPEC::MILSpec::NamedValueType CreateNamedTensorValueType(const NodeArg& n void AddOperationInput(MILSpec::Operation& op, std::string_view input_name, std::string_view value_name) { MILSpec::Argument arg; - arg.mutable_arguments()->Add()->set_name(std::string(value_name)); + arg.mutable_arguments()->Add()->set_name(value_name.data(), value_name.size()); (*op.mutable_inputs())[input_name] = std::move(arg); } +void AddOperationVariadicInput(MILSpec::Operation& op, std::string_view input_name, + const std::vector& value_names) { + MILSpec::Argument arg; + for (const auto& value : value_names) { + arg.mutable_arguments()->Add()->set_name(value.data(), value.size()); + } + + (*op.mutable_inputs())[input_name] = std::move(arg); +} + +void AddIntermediateOperationOutput(COREML_SPEC::MILSpec::Operation& op, std::string_view output_name, + int32_t element_type, std::optional> shape) { + auto& outputs = *op.mutable_outputs(); + auto& output_arg = *outputs.Add(); + output_arg.set_name(output_name.data(), output_name.size()); + + MILSpec::ValueType& value = *output_arg.mutable_type(); + MILSpec::TensorType& tensor_type = *value.mutable_tensortype(); + + SetTensorTypeInfo(tensor_type, OnnxDataTypeToMILSpec(element_type), shape, /*convert_scalar*/ true); +} + void AddOperationOutput(COREML_SPEC::MILSpec::Operation& op, const NodeArg& output, std::optional override_element_type) { auto& outputs = *op.mutable_outputs(); diff --git a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h index f012e6af0d718..475ce79b0a812 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h +++ b/onnxruntime/core/providers/coreml/builders/impl/builder_utils.h @@ -129,6 +129,26 @@ COREML_SPEC::MILSpec::NamedValueType CreateNamedTensorValueType(const NodeArg& n void AddOperationInput(COREML_SPEC::MILSpec::Operation& op, std::string_view input_name, std::string_view value_name); +/// +/// Add a variadic input argument to a MILSpec::Operation +/// +/// Operation to update. +/// The input name defined by the spec for the operation. +/// The input value names. +void AddOperationVariadicInput(COREML_SPEC::MILSpec::Operation& op, std::string_view input_name, + const std::vector& value_names); + +/// Add an output to a MILSpec::Operation for an intermediate operation when the implementation is composed of +/// multiple MLProgram operations. In this case we don't have a NodeArg for the output. +/// +/// Operation to update. +/// Name of the intermediate output. Create using ModelBuilder::GetUniqueName. +/// onnx::TensorProto_DataType element type of the output. +/// int32_t as that is what TensorShapeProto uses to store the value. +/// Shape of the output if known. +void AddIntermediateOperationOutput(COREML_SPEC::MILSpec::Operation& op, std::string_view output_name, + int32_t element_type, std::optional> shape); + /// /// Add an output to a MILSpec::Operation. Name, data type and shape are used from the NodeArg. 
/// diff --git a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc index 34193318a0264..9ea0030290abd 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/concat_op_builder.cc @@ -4,6 +4,7 @@ #include "core/providers/common.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -18,27 +19,51 @@ class ConcatOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - - layer->mutable_concat()->set_sequenceconcat(false); - - for (const auto* input : node.InputDefs()) { - LOGS(logger, VERBOSE) << "input name " << input->Name(); - *layer->mutable_input()->Add() = input->Name(); +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; // NOLINT + + NodeAttrHelper helper(node); + const auto axis = helper.GetInt64("axis"); // required + const auto interleave = false; + + std::unique_ptr op = model_builder.CreateOperation(node, "concat"); + std::vector input_names; + for (const auto* input : node.InputDefs()) { + input_names.emplace_back(input->Name()); + } + AddOperationVariadicInput(*op, "values", input_names); + AddOperationInput(*op, "axis", model_builder.AddScalarConstant(op->type(), "axis", *axis)); + AddOperationInput(*op, "interleave", model_builder.AddScalarConstant(op->type(), "interleave", interleave)); + AddOperationOutput(*op, *node.OutputDefs()[0]); + model_builder.AddOperation(std::move(op)); + } else // NOLINT +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + + layer->mutable_concat()->set_sequenceconcat(false); + + for (const auto* input : node.InputDefs()) { + LOGS(logger, VERBOSE) << "input name " << input->Name(); + *layer->mutable_input()->Add() = input->Name(); + } + + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); } - - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); - - model_builder.AddLayer(std::move(layer)); return Status::OK(); } -bool ConcatOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& /* input_params */, +bool ConcatOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); if (input_defs.size() < 2) { @@ -50,23 +75,25 @@ bool ConcatOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPa if (!GetShape(*input_defs[0], input_shape, logger)) return false; - auto rank = input_shape.size(); - if (rank != 4) { - // For some reason, the concat in CoreML running on 3d tensor will concat on wrong axis - // Instead of concat on axis 0, it will concat on axis 1 - // Disable Concat support for 3d 
tensor for now - // TODO, add ExpandDims and Squeeze, 3d -ExpandDims-> 4d -> Concat -Squeeze-> 3d - LOGS(logger, VERBOSE) << "Concat only support 4d shape for now, input is " - << rank << "d shape"; - return false; - } - - NodeAttrHelper helper(node); - auto axis = static_cast(HandleNegativeAxis(helper.Get("axis", 1), rank)); - if (rank != axis + 3) { - LOGS(logger, VERBOSE) << "Concat only support axis to be -3, actual axis: " << axis - << ", actual rank: " << rank; - return false; + if (!input_params.create_mlprogram) { + auto rank = input_shape.size(); + if (rank != 4) { + // For some reason, the concat in CoreML running on 3d tensor will concat on wrong axis + // Instead of concat on axis 0, it will concat on axis 1 + // Disable Concat support for 3d tensor for now + // TODO: add ExpandDims and Squeeze, 3d -ExpandDims-> 4d -> Concat -Squeeze-> 3d + LOGS(logger, VERBOSE) << "Concat only support 4d shape for now, input is " + << rank << "d shape"; + return false; + } + + NodeAttrHelper helper(node); + auto axis = static_cast(HandleNegativeAxis(helper.Get("axis", 1), rank)); + if (rank != axis + 3) { + LOGS(logger, VERBOSE) << "Concat only support axis to be -3, actual axis: " << axis + << ", actual rank: " << rank; + return false; + } } return true; diff --git a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc index 1eba312b2577b..bec2461ffbc52 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/depthtospace_op_builder.cc @@ -4,6 +4,7 @@ #include "core/common/safeint.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -18,52 +19,133 @@ class DepthToSpaceOpBuilder : public BaseOpBuilder { bool IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; Status DepthToSpaceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, - const logging::Logger& /* logger */) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - + [[maybe_unused]] const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); const auto& output_defs = node.OutputDefs(); const auto& input_name = input_defs[0]->Name(); - const auto& output_name = output_defs[0]->Name(); - uint64_t blocksize = SafeInt(node.GetAttributes().at("blocksize").i()); + NodeAttrHelper helper(node); + int64_t blocksize = *helper.GetInt64("blocksize"); // required attribute + +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; // NOLINT + + const auto mode = helper.Get("mode", "DCR"); + + if (mode == "DCR") { + // DCR is directly supported + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#coremltools.converters.mil.mil.ops.defs.iOS15.tensor_transformation.depth_to_space + // Validated with depth_to_space.py. 
+ auto op = model_builder.CreateOperation(node, "depth_to_space"); + AddOperationInput(*op, "x", input_name); + AddOperationInput(*op, "block_size", model_builder.AddScalarConstant(op->type(), "blocksize", blocksize)); + AddOperationOutput(*op, *output_defs[0]); + model_builder.AddOperation(std::move(op)); + } else { + // CRD is manual. there may be a perf cost from the Reshape's (typically that happens on CPU) but if the input + // is a fixed size hopefully CoreML is smart enough to handle that aspect during model compilation instead + // of execution. + + // https://github.com/onnx/onnx/blob/main/docs/Operators.md#depthtospace + // b, c, h, w = x.shape + // tmp = np.reshape(x, [b, c // (blocksize ** 2), blocksize, blocksize, h, w]) + // tmp = np.transpose(tmp, [0, 1, 4, 2, 5, 3]) + // y = np.reshape(tmp, [b, c // (blocksize ** 2), h * blocksize, w * blocksize]) + // + // CoreML has a 5D limit, so we merge the batch dim into the channel dim as that doesn't change the data + // movement. + // First reshape is to [b * c // (blocksize ** 2), blocksize, blocksize, h, w] + // Transpose is to [0, 3, 1, 4, 2] + + // we checked shape was static in IsOpSupportedImpl so this should never fail + std::vector input_shape; + ORT_RETURN_IF_NOT(GetStaticShape(*input_defs[0], input_shape, logger), "Failed to get input shape"); + const int32_t elem_type = static_cast(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + + // reshape to [b * c // (blocksize ** 2), blocksize, blocksize, h, w] + auto reshape1 = model_builder.CreateOperation(node, "reshape", "pre"); + std::vector shape1 = {input_shape[0] * input_shape[1] / (blocksize * blocksize), + blocksize, blocksize, input_shape[2], input_shape[3]}; + AddOperationInput(*reshape1, "x", input_name); + AddOperationInput(*reshape1, "shape", model_builder.AddConstant(reshape1->type(), "shape", shape1)); + const auto& reshape1_output = model_builder.GetUniqueName(node, "reshape1"); + AddIntermediateOperationOutput(*reshape1, reshape1_output, elem_type, shape1); + + // transpose to [0, 3, 1, 4, 2] + auto transpose = model_builder.CreateOperation(node, "transpose"); + std::vector perm = {0, 3, 1, 4, 2}; + std::vector shape2 = {shape1[0], shape1[3], shape1[1], shape1[4], shape1[2]}; + AddOperationInput(*transpose, "x", reshape1_output); + AddOperationInput(*transpose, "perm", model_builder.AddConstant(transpose->type(), "perm", perm)); + const auto& transpose_output = model_builder.GetUniqueName(node, "transpose"); + AddIntermediateOperationOutput(*transpose, transpose_output, elem_type, shape2); + + // reshape to [b, c // (blocksize ** 2), h * blocksize, w * blocksize] + auto reshape2 = model_builder.CreateOperation(node, "reshape", "post"); + std::vector shape3 = {input_shape[0], + input_shape[1] / (blocksize * blocksize), + input_shape[2] * blocksize, + input_shape[3] * blocksize}; + AddOperationInput(*reshape2, "x", transpose_output); + AddOperationInput(*reshape2, "shape", model_builder.AddConstant(reshape2->type(), "shape", shape3)); + + AddOperationOutput(*reshape2, *output_defs[0]); + + model_builder.AddOperation(std::move(reshape1)); + model_builder.AddOperation(std::move(transpose)); + model_builder.AddOperation(std::move(reshape2)); + } + } else // NOLINT +#endif // if defined(COREML_ENABLE_MLPROGRAM) + { + const auto& output_name = output_defs[0]->Name(); + std::unique_ptr layer = model_builder.CreateNNLayer(node); - auto* coreml_depthtospace = layer->mutable_reorganizedata(); - coreml_depthtospace->set_blocksize(blocksize); - 
coreml_depthtospace->set_mode(CoreML::Specification::ReorganizeDataLayerParams_ReorganizationType:: - ReorganizeDataLayerParams_ReorganizationType_DEPTH_TO_SPACE); + auto* coreml_depthtospace = layer->mutable_reorganizedata(); + coreml_depthtospace->set_blocksize(static_cast(blocksize)); + coreml_depthtospace->set_mode(CoreML::Specification::ReorganizeDataLayerParams_ReorganizationType:: + ReorganizeDataLayerParams_ReorganizationType_DEPTH_TO_SPACE); - *layer->mutable_input()->Add() = input_name; - *layer->mutable_output()->Add() = output_name; + *layer->mutable_input()->Add() = input_name; + *layer->mutable_output()->Add() = output_name; + + model_builder.AddLayer(std::move(layer)); + } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } -bool DepthToSpaceOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& /*input_params*/, +bool DepthToSpaceOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); std::vector input_shape; if (!GetShape(*input_defs[0], input_shape, logger)) { + LOGS(logger, VERBOSE) << "DepthToSpace: no input shape"; return false; } - const auto input_rank = input_shape.size(); - if (input_rank < 4) { - LOGS(logger, VERBOSE) << "DepthToSpace does not support input shape of " << input_rank << "d shape."; - } + // ONNX and CoreML both require 4D input so no need to check the shape here. NodeAttrHelper helper(node); - if (node.SinceVersion() >= 11) { - // For now, only DCR mode DepthToSpace is supported - const auto mode = helper.Get("mode", "DCR"); + const auto mode = helper.Get("mode", "DCR"); + + if (input_params.create_mlprogram) { + if (mode == "CRD" && !IsStaticShape(input_shape)) { + // we need to manually implement the logic with a Reshape, so we need to know the shape to do that + LOGS(logger, VERBOSE) << "DepthToSpace: CRD mode requires static shape"; + return false; + } + } else { if (mode != "DCR") { - LOGS(logger, VERBOSE) << "The mode: " << mode << "of DepthToSpace is not supported in CoreML EP for now."; + LOGS(logger, VERBOSE) << "DepthToSpace: " << mode << " mode is not supported"; return false; } } diff --git a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc index bfc665e0ac716..9caec290ea5a2 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/gridsample_op_builder.cc @@ -19,8 +19,8 @@ std::string_view GetMode(const NodeAttrHelper& helper) { // opset 20+ uses linear, nearest, cubic // bilinear is what CoreML uses, so prefer that // bicubic/cubic isn't supported - - const auto& mode = helper.Get("mode", "linear"); + static const std::string default_mode = "linear"; // static in case we ever return the default as a string_view + const auto& mode = helper.Get("mode", default_mode); if (mode == "linear") { return "bilinear"; } diff --git a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc index 535712f096010..b0006b24e7d75 100644 --- a/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/coreml/builders/op_builder_factory.cc @@ -15,28 +15,28 @@ namespace coreml { static OpBuilderRegistrations CreateOpBuilderRegistrations() { OpBuilderRegistrations op_registrations; + // Activations + 
CreateActivationOpBuilder("Sigmoid", op_registrations); + CreateActivationOpBuilder("Tanh", op_registrations); + CreateActivationOpBuilder("Relu", op_registrations); + CreateActivationOpBuilder("PRelu", op_registrations); + CreateActivationOpBuilder("LeakyRelu", op_registrations); + // Unary ops - CreateUnaryOpBuilder("Sqrt", op_registrations); CreateUnaryOpBuilder("Reciprocal", op_registrations); + CreateUnaryOpBuilder("Sqrt", op_registrations); // Binary elementwise ops CreateBinaryOpBuilder("Add", op_registrations); + CreateBinaryOpBuilder("Div", op_registrations); CreateBinaryOpBuilder("Mul", op_registrations); CreateBinaryOpBuilder("Pow", op_registrations); CreateBinaryOpBuilder("Sub", op_registrations); - CreateBinaryOpBuilder("Div", op_registrations); - - // Activations - CreateActivationOpBuilder("Sigmoid", op_registrations); - CreateActivationOpBuilder("Tanh", op_registrations); - CreateActivationOpBuilder("Relu", op_registrations); - CreateActivationOpBuilder("PRelu", op_registrations); - CreateActivationOpBuilder("LeakyRelu", op_registrations); // Pooling ops + CreatePoolOpBuilder("AveragePool", op_registrations); CreatePoolOpBuilder("GlobalAveragePool", op_registrations); CreatePoolOpBuilder("GlobalMaxPool", op_registrations); - CreatePoolOpBuilder("AveragePool", op_registrations); CreatePoolOpBuilder("MaxPool", op_registrations); // Reduction ops @@ -54,6 +54,7 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateFlattenOpBuilder("Flatten", op_registrations); CreateGatherOpBuilder("Gather", op_registrations); CreateGemmOpBuilder("Gemm", op_registrations); + CreateGridSampleOpBuilder("GridSample", op_registrations); CreateLRNOpBuilder("LRN", op_registrations); CreateGemmOpBuilder("MatMul", op_registrations); CreatePadOpBuilder("Pad", op_registrations); @@ -66,8 +67,6 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateSqueezeOpBuilder("Squeeze", op_registrations); CreateTransposeOpBuilder("Transpose", op_registrations); - CreateGridSampleOpBuilder("GridSample", op_registrations); - return op_registrations; } diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc index a92fef81ac395..f2cd4d01174d3 100644 --- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc +++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc @@ -83,7 +83,9 @@ CoreMLExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_vie }; result = utils::CreateSupportedPartitions(graph_viewer, supported_nodes, {}, - gen_metadef_name, COREML, kCoreMLExecutionProvider); + gen_metadef_name, COREML, kCoreMLExecutionProvider, + nullptr, + /*drop_constant_initializers*/ true); const auto num_of_partitions = result.size(); const auto num_of_supported_nodes = std::transform_reduce( diff --git a/onnxruntime/core/providers/coreml/DebugMLProgram.md b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/DebugMLProgram.md similarity index 97% rename from onnxruntime/core/providers/coreml/DebugMLProgram.md rename to onnxruntime/core/providers/coreml/mlprogram_test_scripts/DebugMLProgram.md index e41a515594303..b7a54466ab8dd 100644 --- a/onnxruntime/core/providers/coreml/DebugMLProgram.md +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/DebugMLProgram.md @@ -25,6 +25,8 @@ https://apple.github.io/coremltools/docs-guides/source/model-intermediate-langua Usage is reasonably intuitive. The below example defines a model with 2 inputs and a matmul operator. 
The model is printed, and run with randomly generated inputs. The output from doing so is printed. +There are additional test scripts in this directory for different operators. + ```python import numpy as np import coremltools as ct diff --git a/onnxruntime/core/providers/coreml/mlprogram_test_scripts/concat_test.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/concat_test.py new file mode 100644 index 0000000000000..430a2b3fa3ed0 --- /dev/null +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/concat_test.py @@ -0,0 +1,33 @@ +import coremltools as ct +import numpy as np +from coremltools.converters.mil import Builder as mb + +target = ct.target.iOS15 + +a_shape = (1, 1, 3, 3) + + +@mb.program( + input_specs=[mb.TensorSpec(shape=a_shape), mb.TensorSpec(shape=a_shape), mb.TensorSpec(shape=a_shape)], + opset_version=target, +) +def prog(x, y, z): + axis = mb.const(val=1) + interleave = mb.const(val=False) + z = mb.concat(values=(x, y, z), axis=axis, interleave=interleave) + return z + + +print(prog) + +# Convert to ML program +m = ct.convert(prog, minimum_deployment_target=target, compute_precision=ct.precision.FLOAT32) + +x = np.random.rand(*a_shape) +y = np.random.rand(*a_shape) +z = np.random.rand(*a_shape) + +# spec = m.get_spec() +# print(spec) + +print(m.predict({"x": x, "y": y, "z": z})) diff --git a/onnxruntime/core/providers/coreml/mlprogram_test_scripts/convtranspose_test.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/convtranspose_test.py new file mode 100644 index 0000000000000..2c8cbc4948a6b --- /dev/null +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/convtranspose_test.py @@ -0,0 +1,42 @@ +import coremltools as ct +import numpy as np +from coremltools.converters.mil import Builder as mb + +target = ct.target.iOS15 + +x_shape = (1, 3, 4, 4) +w_shape = (3, 3, 3, 3) + + +@mb.program(input_specs=[mb.TensorSpec(shape=x_shape)], opset_version=target) +def prog(x): + weight = mb.const(name="weight", val=np.ones(w_shape, dtype=np.float32)) + output_shape = mb.const(name="output_shape", val=np.array([1, 3, 4, 4])) + # pad = mb.const(val=np.zeros((4), dtype=np.int32)) + strides = mb.const(name="strides", val=np.ones((2), dtype=np.int32)) + dilations = mb.const(name="dilations", val=np.ones((2), dtype=np.int32)) + z = mb.conv_transpose( + x=x, weight=weight, strides=strides, dilations=dilations, output_shape=output_shape + ) # , pad=pad + + return z + + +print(prog) + +# Convert to ML program +m = ct.convert(prog, minimum_deployment_target=target, compute_precision=ct.precision.FLOAT32) + +# spec = m.get_spec() +# print(spec) + +m.save("ConvTranspose.mlpackage") +# construct MLModel with compute_units=ComputeUnit.CPU and run predict +m_cpu = ct.models.MLModel("ConvTranspose.mlpackage", compute_units=ct.ComputeUnit.CPU_ONLY) +m_all = ct.models.MLModel("ConvTranspose.mlpackage", compute_units=ct.ComputeUnit.ALL) + +x = np.ones(x_shape, dtype=np.float32) +print("CPU_ONLY") +print(m_cpu.predict({"x": x})) +print("ALL") +print(m_all.predict({"x": x})) diff --git a/onnxruntime/core/providers/coreml/mlprogram_test_scripts/depthtospace_test.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/depthtospace_test.py new file mode 100644 index 0000000000000..593d9e8bbf66a --- /dev/null +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/depthtospace_test.py @@ -0,0 +1,51 @@ +import coremltools as ct +import numpy as np +from coremltools.converters.mil import Builder as mb + +target = ct.target.iOS15 + +# replicate example 
from https://github.com/onnx/onnx/blob/main/docs/Operators.md#depthtospace +# to prove CoreML mode is DCR +x_shape = (1, 8, 2, 3) + + +@mb.program(input_specs=[mb.TensorSpec(shape=x_shape)], opset_version=target) +def prog(x): + block_size = mb.const(name="block_size", val=2) + z = mb.depth_to_space(x=x, block_size=block_size) + return z + + +print(prog) + +# Convert to ML program +m = ct.convert(prog, minimum_deployment_target=target, compute_precision=ct.precision.FLOAT32) + +# spec = m.get_spec() +# print(spec) + +m.save("DepthToSpace.mlpackage") + +# also check for differences between CPU_ONLY and ALL +m_cpu = ct.models.MLModel("DepthToSpace.mlpackage", compute_units=ct.ComputeUnit.CPU_ONLY) +m_all = ct.models.MLModel("DepthToSpace.mlpackage", compute_units=ct.ComputeUnit.ALL) + +x = np.array( + [ + [ + [[0.0, 1.0, 2.0], [3.0, 4.0, 5.0]], + [[9.0, 10.0, 11.0], [12.0, 13.0, 14.0]], + [[18.0, 19.0, 20.0], [21.0, 22.0, 23.0]], + [[27.0, 28.0, 29.0], [30.0, 31.0, 32.0]], + [[36.0, 37.0, 38.0], [39.0, 40.0, 41.0]], + [[45.0, 46.0, 47.0], [48.0, 49.0, 50.0]], + [[54.0, 55.0, 56.0], [57.0, 58.0, 59.0]], + [[63.0, 64.0, 65.0], [66.0, 67.0, 68.0]], + ] + ] +).astype(np.float32) + +print("CPU_ONLY") +print(m_cpu.predict({"x": x})) +print("ALL") +print(m_all.predict({"x": x})) diff --git a/onnxruntime/core/providers/coreml/mlprogram_test_scripts/div_test.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/div_test.py new file mode 100644 index 0000000000000..a0423511598ff --- /dev/null +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/div_test.py @@ -0,0 +1,103 @@ +import coremltools as ct +import numpy as np +from coremltools.converters.mil import Builder as mb +from coremltools.models import datatypes +from coremltools.models.neural_network import NeuralNetworkBuilder +from coremltools.models.utils import save_spec + +input_dim = (1,) +output_dim = (1,) + + +def mlprogram(): + target = ct.target.iOS15 + + @mb.program(input_specs=[mb.TensorSpec(shape=input_dim), mb.TensorSpec(shape=input_dim)], opset_version=target) + def prog(x, y): + return mb.real_div(x=x, y=y) + + # print(prog) + + # Convert to ML program + m = ct.convert(prog, minimum_deployment_target=target) + + x = np.array([2], dtype=np.float32) + y = np.array([2047], dtype=np.float32) + + # spec = m.get_spec() + # print(spec) + + print(m.predict({"x": x, "y": y})) + + +# implement Div with coremltools approach of x * (1/y) +def nn(): + input_features = [("x", datatypes.Array(*input_dim)), ("y_inv", datatypes.Array(*input_dim))] + output_features = [("final", datatypes.Array(*output_dim))] + + # Build a simple neural network with 1 inner product layer + builder = NeuralNetworkBuilder(input_features, output_features) + builder.add_elementwise( + name="x_multiply_inverse_of_y", + input_names=["x", "y_inv"], + output_name="final", + mode="MULTIPLY", + ) + + save_spec(builder.spec, "network.mlmodel") + m = ct.models.MLModel("network.mlmodel") + + x = np.array([2], dtype=np.float32) + y = np.array([1 / 2047], dtype=np.float32) + print(m.predict({"x": x, "y_inv": y})) + + +def nn_scale(): + input_features = [ + ("x", datatypes.Array(*input_dim)), + ("y_inv", datatypes.Array(*input_dim)), + ("z", datatypes.Array(*input_dim)), + ] + output_features = [("final", datatypes.Array(*output_dim))] + + builder = NeuralNetworkBuilder(input_features, output_features) + + builder.add_elementwise( + name="div_implemented_as_x_multiply_inverse_of_y", + input_names=["x", "y_inv"], + output_name="div_result", + mode="MULTIPLY", + ) + + 
builder.add_elementwise( + name="apply_scaling_factor", + input_names=["div_result", "z"], + output_name="final", + mode="MULTIPLY", + ) + + from coremltools.models.utils import save_spec + + save_spec(builder.spec, "network.mlmodel") + m = ct.models.MLModel("network.mlmodel") + + a = 2 + b = 2047 + # scaling factor to test working around coremltools inaccuracy. + # weirdly even a scaling factor of 1 fixes the problem from https://github.com/microsoft/onnxruntime/issues/21170 + c = 1000 + + x = np.array([a], dtype=np.float32) + y = np.array([1 / b / c], dtype=np.float32) + z = np.array([c], dtype=np.float32) + print(m.predict({"x": x, "y_inv": y, "z": z})) + + +print("NN") +nn() + +print("\nNN with scaling") +nn_scale() + +print("\nML Program") +mlprogram() diff --git a/onnxruntime/core/providers/coreml/dump_mlprogram_model.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/dump_mlprogram_model.py similarity index 100% rename from onnxruntime/core/providers/coreml/dump_mlprogram_model.py rename to onnxruntime/core/providers/coreml/mlprogram_test_scripts/dump_mlprogram_model.py diff --git a/onnxruntime/core/providers/coreml/mlprogram_test_scripts/gridsample_test.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/gridsample_test.py new file mode 100644 index 0000000000000..5ce79c204c00c --- /dev/null +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/gridsample_test.py @@ -0,0 +1,114 @@ +import coremltools as ct +import numpy as np +from coremltools.converters.mil import Builder as mb + +target = ct.target.iOS15 + +x_shape = (2, 2, 3, 2) +grid_shape = (2, 3, 2, 2) + + +@mb.program(input_specs=[mb.TensorSpec(shape=x_shape), mb.TensorSpec(shape=grid_shape)], opset_version=target) +def prog(x, grid): + sampling = mb.const(name="sampling_mode", val="bilinear") + padding_mode = mb.const(name="pmode", val="reflection") + pad = mb.const(name="pval", val=np.float32(0)) + coord_mode = mb.const(name="coord_mode", val="normalized_minus_one_to_one") + align_corners = mb.const(name="align_corners", val=False) + z = mb.resample( + x=x, + coordinates=grid, + sampling_mode=sampling, + padding_mode=padding_mode, + padding_value=pad, + coordinates_mode=coord_mode, + align_corners=align_corners, + ) + + return z + + +# print(prog) + +# Convert to ML program +m = ct.convert(prog, minimum_deployment_target=target, compute_precision=ct.precision.FLOAT32) + +# spec = m.get_spec() +# print(spec) + +m.save("GridSample.mlpackage") +# construct MLModel with compute_units=ComputeUnit.CPU and run predict +m_cpu = ct.models.MLModel("GridSample.mlpackage", compute_units=ct.ComputeUnit.CPU_ONLY) +m_all = ct.models.MLModel("GridSample.mlpackage", compute_units=ct.ComputeUnit.ALL) + +# GridSampleTest.test_grid_sample_20_4D_bilinear_reflection_no_align_corners +# ORT produces different output for this test. 
ORT output is generated by pytorch +x = ( + np.array( + [ + -0.173652, + -1.513725, + -0.704586, + -1.952375, + -0.699404, + -0.806298, + 1.640852, + -0.138969, + -0.695411, + -1.352111, + 0.568797, + -0.564294, + -0.056468, + 0.641604, + -0.438370, + 0.450167, + -1.091401, + 1.669729, + -0.908544, + 0.244467, + 0.172109, + 1.156741, + -0.617128, + 1.155460, + ] + ) + .astype(np.float32) + .reshape(x_shape) +) + +grid = ( + np.array( + [ + 0.252250, + -0.151452, + 0.824706, + -0.588292, + -0.591147, + -0.155082, + -0.732938, + 0.457493, + -0.439559, + 0.492330, + 0.696447, + 0.700722, + -0.220298, + 0.654884, + -0.635434, + -1.195619, + -0.114204, + -0.870080, + -0.929674, + 0.305035, + 1.025429, + -0.472240, + -0.067881, + -0.869393, + ] + ) + .astype(np.float32) + .reshape(grid_shape) +) + + +print(m_cpu.predict({"x": x, "grid": grid})) +print(m_all.predict({"x": x, "grid": grid})) diff --git a/onnxruntime/core/providers/coreml/mlprogram_test_scripts/resize_test.py b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/resize_test.py new file mode 100644 index 0000000000000..f83dc6ddfe02f --- /dev/null +++ b/onnxruntime/core/providers/coreml/mlprogram_test_scripts/resize_test.py @@ -0,0 +1,51 @@ +import coremltools as ct +import numpy as np +from coremltools.converters.mil import Builder as mb + +target = ct.target.iOS15 + +x_shape = (1, 1, 3, 6) + +use_scale = False # set this to test upsample vs resize + + +@mb.program(input_specs=[mb.TensorSpec(shape=x_shape)], opset_version=target) +def prog(x): + global use_scale # noqa + + if use_scale: + align = mb.const(val=False) + scale_h = mb.const(val=float(1 / 3)) + scale_w = mb.const(val=float(1 / 3)) + z = mb.upsample_bilinear(x=x, scale_factor_height=scale_h, scale_factor_width=scale_w, align_corners=align) + else: + size_h = mb.const(val=1) + size_w = mb.const(val=2) + sampling_mode = mb.const(val="UNALIGN_CORNERS") + z = mb.resize_bilinear(x=x, target_size_height=size_h, target_size_width=size_w, sampling_mode=sampling_mode) + + return z + + +print(prog) + +# Convert to ML program +m = ct.convert(prog, minimum_deployment_target=target, compute_precision=ct.precision.FLOAT32) + +x = np.array( + [ + [ + [ + [1, 2, 3, 4, 5, 6], + [7, 8, 9, 10, 11, 12], + [13, 14, 15, 16, 17, 18], + ] + ] + ], + dtype=np.float32, +) + +# spec = m.get_spec() +# print(spec) + +print(m.predict({"x": x})) diff --git a/onnxruntime/core/providers/partitioning_utils.cc b/onnxruntime/core/providers/partitioning_utils.cc index c45f5cd0848dd..83c08f3dbd25e 100644 --- a/onnxruntime/core/providers/partitioning_utils.cc +++ b/onnxruntime/core/providers/partitioning_utils.cc @@ -88,8 +88,6 @@ It is required to ensure we do not break up a QDQ node unit during partitioning. @param graph_viewer GraphViewer that IExecutionProvider::GetCapability is called with. @param is_node_supported_fn Callback to check whether a node is supported. @param on_group_closed_fn Callback to indicate a completed partition node group. -@param debug_output Print diagnostic output about the partitions and reasons for partition breaks. - No-op in a release build. @return The partition node groups. 
*/ std::vector> CreateSupportedPartitionNodeGroups( @@ -97,12 +95,7 @@ std::vector> CreateSupportedPartitionNodeGroups( const IsNodeSupportedFn& is_node_supported_fn, const OnGroupClosedFn& on_group_closed_fn, const std::string& execution_provider_type, - const std::unordered_map* node_unit_map, - bool debug_output) { -#ifdef NDEBUG - ORT_UNUSED_PARAMETER(debug_output); -#endif - + const std::unordered_map* node_unit_map) { ORT_ENFORCE(is_node_supported_fn, "Node support test is required."); /* @@ -146,12 +139,10 @@ std::vector> CreateSupportedPartitionNodeGroups( auto close_group = [&]() { if (!supported_group.empty()) { #ifndef NDEBUG - if (debug_output) { - LOGS_DEFAULT(VERBOSE) << "New partition node group.\n" - << "Unsupported nodes on group border: " - << NodeGroupDebugString(nodes_to_process_with_next_group, true) << "\n" - << "Nodes in group: " << NodeGroupDebugString(supported_group); - } + LOGS_DEFAULT(VERBOSE) << "New partition node group.\n" + << "Unsupported nodes on group border: " + << NodeGroupDebugString(nodes_to_process_with_next_group, true) << "\n" + << "Nodes in group: " << NodeGroupDebugString(supported_group); #endif // if no on_group_closed_fn callback was given, keep the partition @@ -163,7 +154,7 @@ std::vector> CreateSupportedPartitionNodeGroups( } #ifndef NDEBUG else { - LOGS_DEFAULT_IF(debug_output, VERBOSE) << "Discarded partition node group."; + LOGS_DEFAULT(VERBOSE) << "Discarded partition node group."; } #endif @@ -291,7 +282,8 @@ InlinedHashSet CreateExcludedNodeSet(const GraphViewer& graph_viewe std::unique_ptr MakeComputeCapability(const GraphViewer& graph_viewer, const std::vector& group, const GenerateMetadefNameFn& generate_metadef_name, - const std::string& execution_provider_name) { + const std::string& execution_provider_name, + bool drop_constant_initializers) { std::unordered_set node_set; node_set.reserve(group.size()); node_set.insert(group.cbegin(), group.cend()); @@ -354,6 +346,10 @@ std::unique_ptr MakeComputeCapability(const GraphViewer& grap meta_def->status = ONNX_NAMESPACE::EXPERIMENTAL; for (const auto& input : ordered_subgraph_inputs) { + if (drop_constant_initializers && graph_viewer.IsConstantInitializer(input->Name(), true)) { + continue; + } + meta_def->inputs.push_back(input->Name()); } @@ -374,13 +370,12 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, const std::string& execution_provider_name, const std::string& execution_provider_type, const std::unordered_map* node_unit_map, - bool debug_output) { + bool drop_constant_initializers) { const auto groups = CreateSupportedPartitionNodeGroups(graph_viewer, is_node_supported_fn, on_partition_closed_fn, execution_provider_type, - node_unit_map, - debug_output); + node_unit_map); std::vector> partitions{}; partitions.reserve(groups.size()); @@ -390,7 +385,7 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, std::back_inserter(partitions), [&](const auto& supported_partition) { return MakeComputeCapability(graph_viewer, supported_partition, generate_metadef_name_fn, - execution_provider_name); + execution_provider_name, drop_constant_initializers); }); return partitions; @@ -404,7 +399,7 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, const std::string& execution_provider_name, const std::string& execution_provider_type, const std::unordered_map* node_unit_map, - bool debug_output) { + bool drop_constant_initializers) { const auto excluded_nodes = CreateExcludedNodeSet(graph_viewer, stop_ops); const bool check_excluded_nodes = 
!excluded_nodes.empty(); @@ -419,7 +414,7 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, execution_provider_name, execution_provider_type, node_unit_map, - debug_output); + drop_constant_initializers); } } // namespace utils diff --git a/onnxruntime/core/providers/partitioning_utils.h b/onnxruntime/core/providers/partitioning_utils.h index c3f6b104e3f6a..235a88cfdb8a5 100644 --- a/onnxruntime/core/providers/partitioning_utils.h +++ b/onnxruntime/core/providers/partitioning_utils.h @@ -62,9 +62,10 @@ Create the supported partitions for the execution provider. @param execution_provider_type ExecutionProviderType of the EP creating this ComputeCapability instance. @param node_unit_map Map of each Node in the graph_viewer to its NodeUnit. Provide if EP handles QDQ format models. Should be created by EP calling GetAllNodeUnits. -@param debug_output Print diagnostic output about the partitions and reasons for partition breaks. - No-op in a release build. - +@param drop_constant_initializer Drop constant initializers from input to a ComputeCapability. + Set to true if constant initializers have been copied into a compiled model to allow + ORT to free the initializer. If the initializer remains as an input it will appear to + still be in-use. @returns ComputeCapability instances for all partitions assigned to the execution provider. */ std::vector> @@ -74,8 +75,8 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, const GenerateMetadefNameFn& generate_metadef_name_fn, const std::string& execution_provider_name, const std::string& execution_provider_type, - const std::unordered_map* node_unit_map = nullptr, - bool debug_output = false); + const std::unordered_map* node_unit_map, + bool drop_constant_initializers = false); /** Create the supported partitions for the execution provider. @@ -88,9 +89,10 @@ Create the supported partitions for the execution provider. @param execution_provider_type ExecutionProviderType of the EP creating this ComputeCapability instance. @param node_unit_map Map of each Node in the graph_viewer to its NodeUnit. Provide if EP handles QDQ format models. Should be created by EP calling GetAllNodeUnits. -@param debug_output Print diagnostic output about the partitions and reasons for partition breaks. - No-op in a release build. - +@param drop_constant_initializer Drop constant initializers from input to a ComputeCapability. + Set to true if constant initializers have been copied into a compiled model to allow + ORT to free the initializer. If the initializer remains as an input it will appear to + still be in-use. @returns ComputeCapability instances for all partitions assigned to the execution provider. */ std::vector> @@ -100,8 +102,8 @@ CreateSupportedPartitions(const GraphViewer& graph_viewer, const GenerateMetadefNameFn& generate_metadef_name, const std::string& execution_provider_name, const std::string& execution_provider_type, - const std::unordered_map* node_unit_map = nullptr, - bool debug_output = false); + const std::unordered_map* node_unit_map, + bool drop_constant_initializers = false); /** Create a ComputeCapability instance from the group of nodes. @@ -120,7 +122,8 @@ Will automatically determine the inputs and outputs required. 
std::unique_ptr MakeComputeCapability(const GraphViewer& graph_viewer, const std::vector& group, const GenerateMetadefNameFn& generate_metadef_name, - const std::string& execution_provider_name); + const std::string& execution_provider_name, + bool drop_constant_initializers); /** Create the set of nodes to exclude based on a set of stop ops. diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 0ddaa97694217..539b456cb657f 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -565,7 +565,8 @@ static void PartitionCtxModel(const onnxruntime::GraphViewer& graph_viewer, supported_groups.begin(), supported_groups.end(), std::back_inserter(result), [&](const auto& supported_partition) { - return utils::MakeComputeCapability(graph_viewer, supported_partition, gen_metadef_name, QNN); + return utils::MakeComputeCapability(graph_viewer, supported_partition, gen_metadef_name, QNN, + /*drop_constant_initializers*/ false); // TODO: could this be set to true? }); const size_t num_of_partitions = result.size(); @@ -660,7 +661,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer // Create partitions from supported nodes. std::vector> partitions = utils::CreateSupportedPartitions( - graph_viewer, supported_nodes, {}, gen_metadef_name, QNN, kQnnExecutionProvider, &node_unit_map, true); + graph_viewer, supported_nodes, {}, gen_metadef_name, QNN, kQnnExecutionProvider, &node_unit_map); // Filter out partitions that consist of a single QuantizeLinear or DequantizeLinear node. // We also count the number of supported nodes in all valid partitions. diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index cc3a9943ca0a3..5ad2f08467792 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -1603,6 +1603,11 @@ Status PartitionOrtFormatModel(onnxruntime::Graph& graph, logger, GraphPartitioner::Mode::kOrtFormatLoad)); +#if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) + // a compiling EP (e.g. CoreML) may copy initializers to its own memory. run the cleanup of unused initializers + // so that they can be freed. 
+ ORT_RETURN_IF_ERROR(graph.RemovedUnusedInitializersOrtFormat()); +#endif return Status::OK(); } diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 679ccce7fb07a..ffcd339c0ca3a 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -40,6 +40,10 @@ #include // for CUDNN_MAJOR #endif +#if defined(USE_COREML) +#include "core/providers/coreml/coreml_provider_factory.h" +#endif + #include // Explicitly provide a definition for the static const var 'GPU' in the OrtDevice struct, @@ -1161,7 +1165,30 @@ std::unique_ptr CreateExecutionProviderInstance( #if !defined(__APPLE__) LOGS_DEFAULT(WARNING) << "CoreML execution provider can only be used to generate ORT format model in this build."; #endif - return onnxruntime::CoreMLProviderFactoryCreator::Create(0)->CreateProvider(); + uint32_t coreml_flags = 0; + + const auto it = provider_options_map.find(type); + if (it != provider_options_map.end()) { + const ProviderOptions& options = it->second; + auto flags = options.find("flags"); + if (flags != options.end()) { + const auto& flags_str = flags->second; + + if (flags_str.find("COREML_FLAG_USE_CPU_ONLY") != std::string::npos) { + coreml_flags |= COREMLFlags::COREML_FLAG_USE_CPU_ONLY; + } + + if (flags_str.find("COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES") != std::string::npos) { + coreml_flags |= COREMLFlags::COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES; + } + + if (flags_str.find("COREML_FLAG_CREATE_MLPROGRAM") != std::string::npos) { + coreml_flags |= COREMLFlags::COREML_FLAG_CREATE_MLPROGRAM; + } + } + } + + return onnxruntime::CoreMLProviderFactoryCreator::Create(coreml_flags)->CreateProvider(); #endif } else if (type == kXnnpackExecutionProvider) { #if defined(USE_XNNPACK) diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index fb85eb4c29bb6..367b4a65e3b7b 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -3693,7 +3693,8 @@ TEST(QDQTransformerTests, QDQ_Selector_Test) { const auto compute_capability = utils::MakeComputeCapability( whole_graph_viewer, nodes, []() { return "sub_graph"; }, - "Test Provider"); + "Test Provider", + /*drop_constant_initializers*/ false); const GraphViewer partial_graph_viewer(graph, *compute_capability->sub_graph); ASSERT_EQ(3, partial_graph_viewer.NumberOfNodes()); diff --git a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc index 5222380d9ca56..a0c1d675f506f 100644 --- a/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/space_depth_ops_test.cc @@ -373,5 +373,36 @@ TEST(TensorOpTest, DepthToSpaceTest_5) { test.Run(); } +TEST(TensorOpTest, DepthToSpaceTest_CRD_Batched) { + OpTester test("DepthToSpace", 11); // create an opset 11 model with attribute present = "CRD" mode + constexpr int64_t blocksize = 2; + test.AddAttribute("blocksize", blocksize); + test.AddAttribute("mode", "CRD"); + + constexpr int64_t N = 2, C = 4, H = 2, W = 3; + std::vector X = {0., 1., 2., + 3., 4., 5., + 9., 10., 11., + 12., 13., 14., + 18., 19., 20., + 21., 22., 23., + 27., 28., 29., + 30., 31., 32.}; + + // append same data but in reverse order so we can tell if the batch output is wrong + X.insert(X.end(), X.rbegin(), X.rend()); + + test.AddInput("input", {N, C, H, W}, X); + + std::vector result = 
{0., 9., 1., 10., 2., 11., + 18., 27., 19., 28., 20., 29., + 3., 12., 4., 13., 5., 14., + 21., 30., 22., 31., 23., 32.}; + result.insert(result.end(), result.rbegin(), result.rend()); + + test.AddOutput("output", {2, 1, 4, 6}, result); + test.Run(); +} + } // namespace test } // namespace onnxruntime diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index 5609033fc3e35..d2a961f17bd6a 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -6,13 +6,16 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Add|| |ai.onnx:AveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.| |ai.onnx:Clip|| +|ai.onnx:Concat|| |ai.onnx:Conv|Only 1D/2D Conv is supported.
Bias if provided must be constant.|
+|ai.onnx:ConvTranspose|Weight and bias must be constant.<br/>padding_type of SAME_UPPER/SAME_LOWER is not supported.<br/>kernel_shape must have default values.<br/>output_shape is not supported.<br/>output_padding must have default values.|
+|ai.onnx.DepthToSpace|If 'mode' is 'CRD' the input must have a fixed shape.|
 |ai.onnx:Div||
 |ai.onnx:Gemm|Input B must be constant.|
 |ai.onnx:GlobalAveragePool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|
 |ai.onnx:GlobalMaxPool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.|
+|ai.onnx:GridSample|4D input.<br/>'mode' of 'linear' or 'zeros'.<br/>
(mode==linear && padding_mode==reflection && align_corners==0) is not supported.| +|ai.onnx.LeakyRelu|| |ai.onnx:MatMul|Only support for transA == 0, alpha == 1.0 and beta == 1.0 is currently implemented.| |ai.onnx:MaxPool|Only 2D Pool is supported currently. 3D and 5D support can be added if needed.| |ai.onnx:Mul|| @@ -24,4 +27,4 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Sub|| |ai.onnx:Sigmoid|| |ai:onnx:Tanh|| -|ai:onnx:Transpose|| +|ai.onnx:Transpose|| From c464ab3acabfd276ca545db7eb364316e0158067 Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Thu, 25 Jul 2024 15:57:30 -0700 Subject: [PATCH 23/57] Allow cpplint to always be green (#21491) Allow cpplint to always be green since it is optional. Also changed the workflow name to reflect that. --- .github/workflows/lint.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 3965fe063b148..2edbe2d814533 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -73,7 +73,7 @@ jobs: checkout_path: ${{ github.workspace }} lint-cpp: - name: Lint C++ + name: Optional Lint C++ runs-on: ubuntu-latest steps: - uses: actions/checkout@master @@ -89,10 +89,11 @@ jobs: - name: Generate ONNX protobuf files run: cmake --build build/Debug --config Debug --target onnx_proto - uses: reviewdog/action-cpplint@master + continue-on-error: true with: github_token: ${{ secrets.github_token }} reporter: github-pr-check - level: warning + level: info flags: --linelength=120 --exclude=java/src/main/native/*.c --exclude=onnxruntime/core/mlas/inc/* From e5302b23c43b690592a818da95b4a31059e59e9e Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Fri, 26 Jul 2024 10:00:28 +1000 Subject: [PATCH 24/57] Fix SkipLayerNormFusion incorrectly setting modified every time it runs (#21502) ### Description Current behavior forces all L2 optimizers to loop until they hit the max number of iterations. Only update modified if the graph was modified. ### Motivation and Context Fix unnecessary loops of L2 optimizers during model loading. --- onnxruntime/core/optimizer/skip_layer_norm_fusion.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc index cf70a7d821d72..655364357999a 100644 --- a/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/skip_layer_norm_fusion.cc @@ -168,7 +168,8 @@ Note: This fusion doesn't consider the following case: LayerNormalization */ -Status SkipLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const { +Status SkipLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, + const logging::Logger& logger) const { GraphViewer graph_viewer(graph); const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder(); InlinedVector> nodes_to_remove; @@ -299,12 +300,15 @@ Status SkipLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_le // Assign provider to this new node. Provider should be same as the provider for old node. 
skip_layer_norm_node.SetExecutionProviderType(ln_node.GetExecutionProviderType()); } + for (const auto& node : nodes_to_remove) { graph_utils::RemoveNodeOutputEdges(graph, node); graph.RemoveNode(node.get().Index()); } - modified = true; + if (!nodes_to_remove.empty()) { + modified = true; + } return Status::OK(); } From 166809425ed3179ca66ba18383ab4664cdc33cde Mon Sep 17 00:00:00 2001 From: aamajumder <150728138+aamajumder@users.noreply.github.com> Date: Thu, 25 Jul 2024 17:06:30 -0700 Subject: [PATCH 25/57] [DML EP] Register ReduceMin-20 (#20477) ### Description This PR registers the ReduceMin-20 operator to the DML EP. ### Motivation and Context --- docs/OperatorKernels.md | 3 ++- .../src/Operators/OperatorRegistration.cpp | 1 + .../providers/dml/OperatorAuthorHelper/OperatorVersions.h | 1 + .../test/testdata/onnx_backend_test_series_filters.jsonc | 5 ++++- 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index ed944b5a6df79..211c53d0fecc8 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -1178,7 +1178,8 @@ Do not modify directly.* |||13+|**T** = tensor(float), tensor(float16)| |||11+|**T** = tensor(float), tensor(float16)| |||1+|**T** = tensor(float), tensor(float16)| -|ReduceMin|*in* data:**T**
*in* axes:**tensor(int64)**<br/> *out* reduced:**T**<br/><br/>or<br/><br/>*in* data:**T**<br/> *out* reduced:**T**|18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)|
+|ReduceMin|*in* data:**T**<br/> *in* axes:**tensor(int64)**<br/> *out* reduced:**T**<br/><br/>or<br/><br/>*in* data:**T**<br/>
*out* reduced:**T**|20+|**T** = tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| +|||18+|**T** = tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(uint32), tensor(uint64)| |||13+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||12+|**T** = tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||11+|**T** = tensor(float), tensor(float16)| diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp index 27605a6ad8e8c..cf8f0a4b2db83 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/Operators/OperatorRegistration.cpp @@ -977,6 +977,7 @@ constexpr static OperatorRegistrationInformation operatorRegistrationInformation {REG_INFO( 12, ReduceMin, typeNameListDefault, supportedTypeListFloat16to32Ints8to64, DmlGraphSupport::Supported)}, {REG_INFO( 13, ReduceMin, typeNameListDefault, supportedTypeListFloat16to32Ints8to64, DmlGraphSupport::Supported)}, {REG_INFO( 18, ReduceMin, typeNameListDefault, supportedTypeListFloat16to32Ints32to64, DmlGraphSupport::Supported, requiredConstantCpuInputs(1))}, + {REG_INFO( 20, ReduceMin, typeNameListDefault, supportedTypeListAllScalars, DmlGraphSupport::Supported, requiredConstantCpuInputs(1))}, {REG_INFO( 7, ArgMax, typeNameListDefault, supportedTypeListArgMinMax, DmlGraphSupport::Supported)}, {REG_INFO( 11, ArgMax, typeNameListDefault, supportedTypeListArgMinMax, DmlGraphSupport::Supported)}, {REG_INFO( 12, ArgMax, typeNameListDefault, supportedTypeListArgMinMax, DmlGraphSupport::Supported)}, diff --git a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h index cd188761b22f7..f45c2b08db94d 100644 --- a/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h +++ b/onnxruntime/core/providers/dml/OperatorAuthorHelper/OperatorVersions.h @@ -434,6 +434,7 @@ namespace OperatorHelper static const int sc_sinceVer_IsNaN = 20; static const int sc_sinceVer_IsInf = 20; static const int sc_sinceVer_ReduceMax = 20; + static const int sc_sinceVer_ReduceMin = 20; } namespace MsftOperatorSet1 diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index 1885a213bdf32..4b14d50127aa9 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -720,7 +720,10 @@ "^test_constantofshape_int_zeros", "^test_reduce_log_sum_empty_set_cpu", "^test_reduce_log_sum_exp_empty_set_cpu", - "^test_reduce_prod_empty_set_cpu" + "^test_reduce_prod_empty_set_cpu", + //Bug: DML EP does not execute operators with an empty input tensor + //TODO: Resolve as a graph implementation that returns a constant inf tensor with appropriate strides + "^test_reduce_min_empty_set_cpu" ], // ORT first supported opset 7, so models with nodes that require versions prior to opset 7 are not supported 
"tests_with_pre_opset7_dependencies": [ From b6b29309a529b28e94787edc359f75c9a2207486 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Fri, 26 Jul 2024 08:07:01 +0800 Subject: [PATCH 26/57] [WebNN EP] Update argMax/argMin to adapt to latest spec (#21452) WebNN spec recently changes the definition of argMax/argMin: - Remove selectLastIndex option, let backends decide to select the last index or not. - Move axes option to axis input --- js/web/docs/webnn-operators.md | 4 ++-- .../builders/impl/argmax_min_op_builder.cc | 23 +++---------------- 2 files changed, 5 insertions(+), 22 deletions(-) diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 8d077846fa6a4..75652899b5e5e 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -13,8 +13,8 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim |:------:|:------:|:------:|:-:|:-:|:------| | Abs | ai.onnx(7-12, 13+) | abs | ✓ | ✓ | | | Add | ai.onnx(7-12, 13, 14+) | add | ✓ | ✓ | | -| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | ✓ | ✓ | WebNN CPU backend only supports 'select_last_index' value is 0 | -| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | ✓ | ✓ | WebNN CPU backend only supports 'select_last_index' value is 0 | +| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | ✓ | ✓ | | +| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | ✓ | ✓ | | | AveragePool | ai.onnx(7-9, 10, 11, 12-18, 19+) | averagePool2d | ✓ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'count_include_pad' value is 0 | | BatchNormalization | ai.onnx(7-8, 9-13, 14, 15+) | batchNormalization | ✓ | ✓ | Only supports 'training_mode' value is 0, one output | | Cast | ai.onnx(7-8, 9-12, 13-18, 19-20, 21+) | cast | ✓ | ✓ | WebNN CPU backend doesn't support casting to uint64 data type | diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc index 1330a3e354871..1ae63a644a287 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc @@ -40,28 +40,20 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); int64_t axis = helper.Get("axis", 0); const auto keep_dims = helper.Get("keepdims", 1); - const auto select_last_index = helper.Get("select_last_index", 0); axis = HandleNegativeAxis(axis, input_rank); - emscripten::val axes = emscripten::val::array(); - axes.call("push", static_cast(axis)); emscripten::val options = emscripten::val::object(); - options.set("axes", axes); options.set("keepDimensions", keep_dims == 1); - options.set("selectLastIndex", select_last_index == 1); - // TODO: use WebNN's opSupportLimits API to check the backend's supported output data types. - // If the backend doesn't support int64 output, we should use default int32 output data type - // then do a type casting (int32 -> int64) for the output. Refer to the CoreML EP for how to - // support int64 output. + // TODO(Honry): check whether int64 output data type is supported by WebNN opSupportLimits() API. 
options.set("outputDataType", "int64"); emscripten::val output = emscripten::val::object(); const auto& op_type = node.OpType(); if (op_type == "ArgMax") { - output = model_builder.GetBuilder().call("argMax", input, options); + output = model_builder.GetBuilder().call("argMax", input, narrow(axis), options); } else if (op_type == "ArgMin") { - output = model_builder.GetBuilder().call("argMin", input, options); + output = model_builder.GetBuilder().call("argMin", input, narrow(axis), options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ArgMaxMinOpBuilder, unknown op: ", op_type); } @@ -81,15 +73,6 @@ bool ArgMaxMinOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initia if (!GetShape(*input_defs[0], input_shape, logger)) return false; - // WebNN CPU backend only supports select_last_index = 0. - if (device_type == WebnnDeviceType::CPU) { - NodeAttrHelper helper(node); - const auto select_last_index = helper.Get("select_last_index", 0); - if (select_last_index) { - LOGS(logger, VERBOSE) << "ArgMax/ArgMin with select_last_index = 1 is not supported on WebNN CPU backend."; - return false; - } - } return true; } From 278f0f5cd2ab18abb48f381fc3fee127ebe960d2 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Fri, 26 Jul 2024 02:23:35 -0700 Subject: [PATCH 27/57] disables qnn in ort training cpu pipeline (#21510) ### Description `enable_windows_arm64_qnn` and `enable_windows_x64_qnn` are true by default but unnecessary for training. This change explicitly sets these parameters to false for training pipeline. ### Motivation and Context ORT 1.19 Release Preparation --- .../azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml index 5fa80bf7ff6d5..1fa88318b8c03 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml @@ -22,3 +22,5 @@ stages: enable_windows_gpu: false enable_mac_cpu: true enable_linux_arm: false + enable_windows_arm64_qnn: false + enable_windows_x64_qnn: false From bbbaef3fa60f1ef1920af5104459251b42390135 Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Fri, 26 Jul 2024 08:46:54 -0700 Subject: [PATCH 28/57] Update text formatting in generate_cgmanifest.py (#21489) The only place where I manually fixed I forgot a format string --- cgmanifests/generate_cgmanifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cgmanifests/generate_cgmanifest.py b/cgmanifests/generate_cgmanifest.py index 52bd3f58645f2..b2e8f6816a2ef 100644 --- a/cgmanifests/generate_cgmanifest.py +++ b/cgmanifests/generate_cgmanifest.py @@ -73,7 +73,7 @@ def add_github_dep(name, parsed_url): return # Make a REST call to convert to tag to a git commit url = f"https://api.github.com/repos/{org_name}/{repo_name}/git/refs/tags/{tag}" - print("requesting {url} ...") + print(f"requesting {url} ...") res = requests.get(url, auth=(args.username, args.token)) response_json = res.json() tag_object = response_json["object"] From 7db7c4e5c80eeecd75dd66a9fa691ac32c3a8a98 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Fri, 26 Jul 2024 14:54:45 -0700 Subject: [PATCH 29/57] Separating all GPU stages into different Pipelines (#21521) ### Description Separating all GPU stages into different Pipelines --- .../win-gpu-cuda-ci-pipeline.yml | 64 
+++++++++++++++++++ .../win-gpu-dml-ci-pipeline.yml | 52 +++++++++++++++ .../win-gpu-doc-gen-ci-pipeline.yml | 61 ++++++++++++++++++ .../win-gpu-training-ci-pipeline.yml | 63 ++++++++++++++++++ tools/ci_build/set-trigger-rules.py | 5 +- 5 files changed, 244 insertions(+), 1 deletion(-) create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-training-ci-pipeline.yml diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml new file mode 100644 index 0000000000000..78e1624b5d123 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-cuda-ci-pipeline.yml @@ -0,0 +1,64 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### + +parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 +- name: RunOnnxRuntimeTests + displayName: Run Tests? + type: boolean + default: true + +stages: +- stage: cuda + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + EnvSetupScript: setup_env_cuda.bat + buildArch: x64 + additionalBuildFlags: >- + --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" + --enable_cuda_profiling --enable_transformers_tool_test + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON + --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_RelWithDebInfo + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + ORT_EP_NAME: CUDA + WITH_CACHE: true + MachinePool: onnxruntime-Win2022-GPU-A10 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml new file mode 100644 index 0000000000000..904979f39ca31 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-dml-ci-pipeline.yml @@ -0,0 +1,52 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### + +parameters: +- name: RunOnnxRuntimeTests + displayName: Run Tests? 
+ type: boolean + default: true + +stages: +- stage: dml + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + EnvSetupScript: setup_env.bat + buildArch: x64 + additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_RelWithDebInfo + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + ORT_EP_NAME: DML + WITH_CACHE: false + MachinePool: onnxruntime-Win2022-GPU-dml-A10 \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml new file mode 100644 index 0000000000000..4106889331350 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-doc-gen-ci-pipeline.yml @@ -0,0 +1,61 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### + +parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +stages: +- stage: kernelDocumentation + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + EnvSetupScript: setup_env_cuda.bat + buildArch: x64 + # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags + additionalBuildFlags: >- + --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda + --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_RelWithDebInfo + RunOnnxRuntimeTests: false + GenerateDocumentation: true + ORT_EP_NAME: CUDA # It doesn't really matter which EP is selected here since this stage is for documentation. + WITH_CACHE: true + MachinePool: onnxruntime-Win2022-GPU-A10 diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-training-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-training-ci-pipeline.yml new file mode 100644 index 0000000000000..3bb6c267f0018 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-training-ci-pipeline.yml @@ -0,0 +1,63 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### + +parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 +- name: RunOnnxRuntimeTests + displayName: Run Tests? 
+ type: boolean + default: true + +stages: +- stage: training + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + EnvSetupScript: setup_env_cuda.bat + buildArch: x64 + additionalBuildFlags: >- + --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" + --skip_onnx_tests + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_RelWithDebInfo + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + ORT_EP_NAME: CUDA + WITH_CACHE: true + MachinePool: onnxruntime-Win2022-GPU-A10 + isTraining: true diff --git a/tools/ci_build/set-trigger-rules.py b/tools/ci_build/set-trigger-rules.py index d26fec41033ca..0d90061e9c687 100644 --- a/tools/ci_build/set-trigger-rules.py +++ b/tools/ci_build/set-trigger-rules.py @@ -34,7 +34,10 @@ "orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml", "orttraining-mac-ci-pipeline.yml", "win-ci-pipeline.yml", - "win-gpu-ci-pipeline.yml", + "win-gpu-ci-dml-pipeline.yml", + "win-gpu-ci-cuda-pipeline.yml", + "win-gpu-ci-training-pipeline.yml", + "win-gpu-ci-doc-gen-pipeline.yml", "win-gpu-tensorrt-ci-pipeline.yml", "win-qnn-arm64-ci-pipeline.yml", "win-qnn-ci-pipeline.yml", From fb61e14153b6a1263c15ea3b62d6bbbc5bde9848 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Fri, 26 Jul 2024 16:56:44 -0700 Subject: [PATCH 30/57] Add QNN EP option context_node_name_prefix to set EPContext node name prefix (#21236) ### Description Add QNN EP option context_node_name_prefix to set EPContext node name prefix ### Motivation and Context For the case to workaround QNN context PD memory limit, user need split the model into pieces and generate the QNN context model separately. It could happen that the generated EPContext node in separate graph has same node name. This will cause issue if glue those EPContext nodes together into a single model. To avoid this user can set this context_node_name_prefix for each split pieces to make the node name unique. --- .../onnxruntime_session_options_config_keys.h | 4 ++ .../providers/qnn/qnn_execution_provider.cc | 9 ++++- .../providers/qnn/qnn_execution_provider.h | 1 + .../test/providers/qnn/qnn_ep_context_test.cc | 39 +++++++++++++++++++ 4 files changed, 52 insertions(+), 1 deletion(-) diff --git a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h index 17ae649e6f174..209fd4279cc99 100644 --- a/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h +++ b/include/onnxruntime/core/session/onnxruntime_session_options_config_keys.h @@ -265,6 +265,10 @@ static const char* const kOrtSessionOptionEpContextFilePath = "ep.context_file_p // "1": dump the EP context into the Onnx model. (default). static const char* const kOrtSessionOptionEpContextEmbedMode = "ep.context_embed_mode"; +// Specify the EPContext node name prefix to make it unique +// in case user need to merge/connect multiple EPContext nodes in one model +static const char* const kOrtSessionOptionEpContextNodeNamePrefix = "ep.context_node_name_prefix"; + // Gemm fastmath mode provides fp32 gemm acceleration with bfloat16 based matmul. // Option values: // - "0": Gemm FastMath mode is not enabled. 
[DEFAULT] diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc index 539b456cb657f..c56a47e67497e 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc @@ -199,6 +199,13 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio context_cache_path_cfg_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); LOGS_DEFAULT(VERBOSE) << "User specified context cache path: " << context_cache_path_cfg_; + + // For the case that workaround QNN context PD memory limit, user need split the model into pieces and + // generate the QNN context model separately. + // It could happen that the generated EPContext node in separate graph has same node name. + // User can set this context_node_name_prefix for each split pieces to avoid that happens. + context_node_name_prefix_ = session_options->config_options.GetConfigOrDefault(kOrtSessionOptionEpContextNodeNamePrefix, ""); + LOGS_DEFAULT(VERBOSE) << "User specified QNN context node name prefix: " << context_node_name_prefix_; } static const std::string BACKEND_PATH = "backend_path"; @@ -613,7 +620,7 @@ QNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_viewer const auto gen_metadef_name = [&]() { uint64_t model_hash; int metadef_id = metadef_id_generator_.GenerateId(graph_viewer, model_hash); - return MakeString(QNN, "_", model_hash, "_", metadef_id); + return MakeString(QNN, context_node_name_prefix_, "_", model_hash, "_", metadef_id); }; // For model with EPContext, make sure each partition only has one single EPContext node diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h index e7419dabb14d1..f00ffb6cfdb96 100644 --- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h +++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h @@ -80,6 +80,7 @@ class QNNExecutionProvider : public IExecutionProvider { std::unordered_map> qnn_models_; bool context_cache_enabled_ = false; std::string context_cache_path_cfg_ = ""; + std::string context_node_name_prefix_ = ""; bool disable_cpu_ep_fallback_ = false; // True if CPU EP fallback has been disabled for this session. 
bool qnn_context_embed_mode_ = true; int32_t vtcm_size_in_mb_ = 0; diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index a3768cb98f584..be3bd2cc5dcd7 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -279,6 +279,45 @@ TEST_F(QnnHTPBackendTests, QnnContextGeneration2InputsOrderIssue) { ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); } +TEST_F(QnnHTPBackendTests, QnnContextGenerationNodeNamePrefix) { + ProviderOptions provider_options; +#if defined(_WIN32) + provider_options["backend_path"] = "QnnHtp.dll"; +#else + provider_options["backend_path"] = "libQnnHtp.so"; +#endif + std::string node_name_prefix = "node_name_prefix_test"; + + // Add kMSDomain to cover contrib op like Gelu + const std::unordered_map domain_to_version = {{"", 13}, {kMSDomain, 1}}; + + auto& logging_manager = DefaultLoggingManager(); + logging_manager.SetDefaultLoggerSeverity(logging::Severity::kERROR); + + const std::string context_binary_file = "./qnn_ctx_2_inputs_order_test_gen.onnx"; + Ort::SessionOptions so; + so.AddConfigEntry(kOrtSessionOptionEpContextEnable, "1"); + so.AddConfigEntry(kOrtSessionOptionEpContextFilePath, context_binary_file.c_str()); + so.AddConfigEntry(kOrtSessionOptionEpContextNodeNamePrefix, node_name_prefix.c_str()); + so.AppendExecutionProvider("QNN", provider_options); + + Ort::Session session(*ort_env, ORT_TSTR("testdata/qnn_ctx_2_inputs_order_test.onnx"), so); + + // Make sure the Qnn context cache binary file is generated + EXPECT_TRUE(std::filesystem::exists(context_binary_file.c_str())); + + std::shared_ptr model; + ASSERT_STATUS_OK(Model::Load(ToPathString(context_binary_file), model, nullptr, DefaultLoggingManager().DefaultLogger())); + for (auto& node : model->MainGraph().Nodes()) { + if (node.OpType() == "EPContext") { + EXPECT_TRUE(node.Name().find(node_name_prefix) != std::string::npos); + } + } + + // clean up + ASSERT_EQ(std::remove(context_binary_file.c_str()), 0); +} + // Run QDQ model on HTP 3 times // 1st run will generate the Qnn context cache onnx file // 2nd run directly loads and run from Qnn context cache model From 64819f6f8cad8387b23d7cc8af1a4b4207e2dfbb Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Fri, 26 Jul 2024 18:45:14 -0700 Subject: [PATCH 31/57] Update benchmark_mha.py to compare with PyTorch SDPA (#21449) ### Description * Update benchmark_mha.py to compare with PyTorch SDPA api. * Write results to csv file. * Use sdpa_kernel cuda provider option instead of environment variables for better control. * Add arguments (`--use_gpu`, `--causal` etc) to allow testing different senarios. * Update benchmark_mha.sh to add cpu benchmarks For Q,K,V format, torch uses BNSH format, while ort uses BSNH format, so the result is not apple-to-apple. However, if the latency difference is large, that could be a warning. #### Example GPU results Example results on A100-SXM4-80GB with settings (use_gpu=TRUE, enable_cuda_graph=FALSE, causal=FALSE, past_sequence_length=0, intra_op_num_threads=0) in Azure Linux. ORT: build from source with CUDA 12.5; PyTorch 2.3.1 for cuda 12.1. 
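For reference, the table below was produced with the benchmark script updated in this PR. A rough sketch of the GPU commands (see benchmark_mha.sh in this change for the full set; the exact settings behind this table may differ slightly):

```
export CUDA_VISIBLE_DEVICES=0

# ORT MultiHeadAttention kernels (default/flash/efficient); results are written to csv
python benchmark_mha.py --use_gpu

# PyTorch SDPA backends for comparison
python benchmark_mha.py --use_gpu --torch

# merge the per-run csv files into one results file
cat benchmark_mha_gpu_*.csv > mha_gpu_benchmark_results.csv
```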
format | batch_size | sequence_length | num_heads | head_size | latency (s) | tflops | kernel -- | -- | -- | -- | -- | -- | -- | -- Q,KV | 4 | 2048 | 32 | 128 | 0.0015 | 179.5 | ort:flash Q,KV | 4 | 2048 | 32 | 128 | 0.0015 | 179.0 | ort:default Q,K,V | 4 | 2048 | 32 | 128 | 0.0016 | 170.0 | ort:default Q,K,V | 4 | 2048 | 32 | 128 | 0.0016 | 169.5 | ort:flash QKV | 4 | 2048 | 32 | 128 | 0.0016 | 168.5 | ort:default QKV | 4 | 2048 | 32 | 128 | 0.0016 | 167.4 | ort:flash Q,K,V | 4 | 2048 | 32 | 128 | 0.0017 | 159.4 | torch:default Q,K,V | 4 | 2048 | 32 | 128 | 0.0018 | 155.0 | torch:flash Q,KV | 4 | 2048 | 32 | 128 | 0.0030 | 92.7 | ort:efficient Q,K,V | 4 | 2048 | 32 | 128 | 0.0030 | 90.9 | ort:efficient QKV | 4 | 2048 | 32 | 128 | 0.0031 | 89.9 | ort:efficient Q,K,V | 4 | 2048 | 32 | 128 | 0.0031 | 89.0 | torch:efficient Q,K,V | 4 | 2048 | 32 | 128 | 0.0054 | 51.3 | torch:math Q,KV | 4 | 4096 | 32 | 128 | 0.0058 | 191.0 | ort:default Q,KV | 4 | 4096 | 32 | 128 | 0.0058 | 190.6 | ort:flash Q,K,V | 4 | 4096 | 32 | 128 | 0.0059 | 187.8 | ort:default Q,K,V | 4 | 4096 | 32 | 128 | 0.0059 | 186.7 | ort:flash QKV | 4 | 4096 | 32 | 128 | 0.0059 | 185.9 | ort:flash QKV | 4 | 4096 | 32 | 128 | 0.0059 | 185.8 | ort:default Q,K,V | 4 | 4096 | 32 | 128 | 0.0067 | 163.4 | torch:default Q,K,V | 4 | 4096 | 32 | 128 | 0.0070 | 157.2 | torch:flash Q,KV | 4 | 4096 | 32 | 128 | 0.0113 | 97.6 | ort:efficient Q,K,V | 4 | 4096 | 32 | 128 | 0.0114 | 96.4 | ort:efficient QKV | 4 | 4096 | 32 | 128 | 0.0114 | 96.2 | ort:efficient Q,K,V | 4 | 4096 | 32 | 128 | 0.0127 | 86.3 | torch:efficient Q,KV | 8 | 2048 | 32 | 128 | 0.0031 | 177.8 | ort:flash Q,KV | 8 | 2048 | 32 | 128 | 0.0031 | 177.7 | ort:default Q,K,V | 8 | 2048 | 32 | 128 | 0.0032 | 170.8 | ort:default Q,K,V | 8 | 2048 | 32 | 128 | 0.0032 | 170.3 | ort:flash QKV | 8 | 2048 | 32 | 128 | 0.0032 | 169.2 | ort:default QKV | 8 | 2048 | 32 | 128 | 0.0033 | 169.0 | ort:flash Q,K,V | 8 | 2048 | 32 | 128 | 0.0034 | 161.9 | torch:default Q,K,V | 8 | 2048 | 32 | 128 | 0.0036 | 152.9 | torch:flash Q,KV | 8 | 2048 | 32 | 128 | 0.0059 | 93.5 | ort:efficient Q,K,V | 8 | 2048 | 32 | 128 | 0.0060 | 91.3 | ort:efficient QKV | 8 | 2048 | 32 | 128 | 0.0060 | 91.0 | ort:efficient Q,K,V | 8 | 2048 | 32 | 128 | 0.0064 | 86.0 | torch:efficient Q,KV | 8 | 4096 | 32 | 128 | 0.0115 | 190.8 | ort:flash Q,KV | 8 | 4096 | 32 | 128 | 0.0115 | 190.7 | ort:default Q,K,V | 8 | 4096 | 32 | 128 | 0.0118 | 187.1 | ort:default Q,K,V | 8 | 4096 | 32 | 128 | 0.0118 | 187.0 | ort:flash QKV | 8 | 4096 | 32 | 128 | 0.0118 | 185.6 | ort:default QKV | 8 | 4096 | 32 | 128 | 0.0118 | 185.6 | ort:flash Q,K,V | 8 | 4096 | 32 | 128 | 0.0139 | 158.7 | torch:default Q,K,V | 8 | 4096 | 32 | 128 | 0.0139 | 158.3 | torch:flash Q,KV | 8 | 4096 | 32 | 128 | 0.0225 | 97.7 | ort:efficient Q,K,V | 8 | 4096 | 32 | 128 | 0.0227 | 96.8 | ort:efficient QKV | 8 | 4096 | 32 | 128 | 0.0228 | 96.3 | ort:efficient Q,K,V | 8 | 4096 | 32 | 128 | 0.0260 | 84.5 | torch:efficient #### Example CPU results Dell XPS 8960 with i9-13900 CPU (use_gpu=FALSE, causal=FALSE, past_sequence_length=0) in Windows. ORT: build from source with CUDA 12.5; PyTorch 2.3.1 for cuda 12.1. 
format | causal | batch_size | seq_len | num_heads | head_size | threads | latency (s) | kernel -- | -- | -- | -- | -- | -- | -- | -- | -- Q,K,V | FALSE | 1 | 128 | 32 | 128 | 8 | 0.0005 | ort:flash Q,K,V | FALSE | 1 | 128 | 32 | 128 | 0 | 0.0009 | ort:flash Q,K,V | FALSE | 1 | 128 | 32 | 128 | 0 | 0.0009 | ort:math Q,K,V | FALSE | 1 | 128 | 32 | 128 | 4 | 0.0009 | ort:flash Q,K,V | FALSE | 1 | 128 | 32 | 128 | 2 | 0.0014 | ort:flash Q,K,V | FALSE | 1 | 128 | 32 | 128 | 1 | 0.0025 | ort:flash Q,K,V | FALSE | 1 | 128 | 32 | 128 | 2 | 0.0045 | torch:default Q,K,V | FALSE | 1 | 128 | 32 | 128 | 24 | 0.0046 | torch:default Q,K,V | FALSE | 1 | 128 | 32 | 128 | 8 | 0.0046 | torch:default Q,K,V | FALSE | 1 | 128 | 32 | 128 | 4 | 0.0046 | torch:default Q,K,V | FALSE | 1 | 128 | 32 | 128 | 1 | 0.0047 | torch:default Q,K,V | FALSE | 1 | 256 | 32 | 128 | 0 | 0.0019 | ort:flash Q,K,V | FALSE | 1 | 256 | 32 | 128 | 8 | 0.0019 | ort:flash Q,K,V | FALSE | 1 | 256 | 32 | 128 | 0 | 0.0022 | ort:math Q,K,V | FALSE | 1 | 256 | 32 | 128 | 4 | 0.0030 | ort:flash Q,K,V | FALSE | 1 | 256 | 32 | 128 | 2 | 0.0047 | ort:flash Q,K,V | FALSE | 1 | 256 | 32 | 128 | 1 | 0.0086 | ort:flash Q,K,V | FALSE | 1 | 256 | 32 | 128 | 2 | 0.0161 | torch:default Q,K,V | FALSE | 1 | 256 | 32 | 128 | 4 | 0.0162 | torch:default Q,K,V | FALSE | 1 | 256 | 32 | 128 | 8 | 0.0162 | torch:default Q,K,V | FALSE | 1 | 256 | 32 | 128 | 24 | 0.0165 | torch:default Q,K,V | FALSE | 1 | 256 | 32 | 128 | 1 | 0.0166 | torch:default Q,K,V | FALSE | 1 | 512 | 32 | 128 | 8 | 0.0077 | ort:flash Q,K,V | FALSE | 1 | 512 | 32 | 128 | 0 | 0.0091 | ort:flash Q,K,V | FALSE | 1 | 512 | 32 | 128 | 0 | 0.0099 | ort:math Q,K,V | FALSE | 1 | 512 | 32 | 128 | 4 | 0.0103 | ort:flash Q,K,V | FALSE | 1 | 512 | 32 | 128 | 2 | 0.0177 | ort:flash Q,K,V | FALSE | 1 | 512 | 32 | 128 | 1 | 0.0328 | ort:flash Q,K,V | FALSE | 1 | 512 | 32 | 128 | 2 | 0.0624 | torch:default Q,K,V | FALSE | 1 | 512 | 32 | 128 | 4 | 0.0624 | torch:default Q,K,V | FALSE | 1 | 512 | 32 | 128 | 8 | 0.0625 | torch:default Q,K,V | FALSE | 1 | 512 | 32 | 128 | 24 | 0.0626 | torch:default Q,K,V | FALSE | 1 | 512 | 32 | 128 | 1 | 0.0640 | torch:default Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 8 | 0.0286 | ort:flash Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 0 | 0.0317 | ort:flash Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 4 | 0.0367 | ort:flash Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 0 | 0.0391 | ort:math Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 2 | 0.0656 | ort:flash Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 1 | 0.1235 | ort:flash Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 24 | 0.2482 | torch:default Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 2 | 0.2483 | torch:default Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 4 | 0.2483 | torch:default Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 8 | 0.2486 | torch:default Q,K,V | FALSE | 1 | 1024 | 32 | 128 | 1 | 0.2538 | torch:default Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 0 | 0.1038 | ort:flash Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 8 | 0.1050 | ort:flash Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 0 | 0.1368 | ort:math Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 4 | 0.1535 | ort:flash Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 2 | 0.2461 | ort:flash Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 1 | 0.4724 | ort:flash Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 8 | 0.9835 | torch:default Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 4 | 0.9841 | torch:default Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 24 | 0.9841 | torch:default Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 2 | 0.9873 | torch:default Q,K,V | FALSE | 1 | 2048 | 32 | 128 | 1 | 0.9985 | 
torch:default ### Motivation and Context To compare with PyTorch SDPA on CPU and CUDA latency. --- .../python/transformers/benchmark_mha.cmd | 47 ++ .../test/python/transformers/benchmark_mha.py | 690 +++++++++++++----- .../test/python/transformers/benchmark_mha.sh | 48 +- .../test/python/transformers/test_mha.py | 46 +- 4 files changed, 609 insertions(+), 222 deletions(-) create mode 100644 onnxruntime/test/python/transformers/benchmark_mha.cmd diff --git a/onnxruntime/test/python/transformers/benchmark_mha.cmd b/onnxruntime/test/python/transformers/benchmark_mha.cmd new file mode 100644 index 0000000000000..0a6d0c37b4a35 --- /dev/null +++ b/onnxruntime/test/python/transformers/benchmark_mha.cmd @@ -0,0 +1,47 @@ +echo "Benchmark Scaled Dot Product Attention (SDPA) performance on GPU:" + +set CUDA_VISIBLE_DEVICES=0 +python benchmark_mha.py --use_gpu +python benchmark_mha.py --use_gpu --use_cuda_graph +python benchmark_mha.py --use_gpu --torch + +type benchmark_mha_gpu_*.csv > mha_gpu_benchmark_results.csv + +echo "Benchmark performance on CPU with number of threads:" +set MKL_DYNAMIC=FALSE +set OMP_NUM_THREADS=1 +python benchmark_mha.py --torch + +set OMP_NUM_THREADS=2 +python benchmark_mha.py --torch + +set OMP_NUM_THREADS=4 +python benchmark_mha.py --torch + +set OMP_NUM_THREADS=8 +python benchmark_mha.py --torch + +set MKL_DYNAMIC= +set OMP_NUM_THREADS= + +set ORT_DISABLE_FLASH_ATTENTION=0 +python benchmark_mha.py --intra_op_num_threads 1 +python benchmark_mha.py --intra_op_num_threads 2 +python benchmark_mha.py --intra_op_num_threads 4 +python benchmark_mha.py --intra_op_num_threads 8 + +echo "Benchmark performance on CPU with default threads settings:" +python benchmark_mha.py + +python benchmark_mha.py --torch + +python benchmark_mha.py --causal +python benchmark_mha.py --torch --causal + +python benchmark_mha.py --causal --has_past + +set ORT_DISABLE_FLASH_ATTENTION=1 +python benchmark_mha.py +set ORT_DISABLE_FLASH_ATTENTION= + +type benchmark_mha_cpu_*.csv > mha_cpu_benchmark_results.csv diff --git a/onnxruntime/test/python/transformers/benchmark_mha.py b/onnxruntime/test/python/transformers/benchmark_mha.py index 111c417479d20..715a92431e6bf 100644 --- a/onnxruntime/test/python/transformers/benchmark_mha.py +++ b/onnxruntime/test/python/transformers/benchmark_mha.py @@ -4,21 +4,35 @@ # -------------------------------------------------------------------------- """ -Benchmark performance of MultiHeadAttention with Nvidia GPU of Compute Capability 8.0, 8.6 or 8.9 in Linux: -sh benchmark_mha.sh +Benchmark performance of MultiHeadAttention with ORT or PyTorch. 
+ +In Linux, run the the following: + sh benchmark_mha.sh + +In Windows, run the the following: + benchmark_mha.cmd """ +import argparse +import csv import math import os import platform import statistics import time -from typing import List, Optional +from contextlib import nullcontext +from datetime import datetime +from enum import IntEnum +from typing import Callable, Dict, List, Optional, Tuple import torch +import torch.utils.benchmark as benchmark from onnx import TensorProto, helper +from packaging.version import Version +from torch.nn.attention import SDPBackend, sdpa_kernel +from torch.nn.functional import scaled_dot_product_attention -from onnxruntime import InferenceSession, get_available_providers +from onnxruntime import InferenceSession, SessionOptions, get_available_providers from onnxruntime.transformers.io_binding_helper import CudaSession @@ -43,6 +57,20 @@ def get_name_list() -> List[str]: return ["Q,K,V", "QKV", "Q,KV", "Q,K',V'"] +class SdpaKernel(IntEnum): + """Bit flags for sdpa_kernel CUDA provider option""" + + DEFAULT = 0 + FLASH_ATTENTION = 1 + EFFICIENT_ATTENTION = 2 + TRT_FUSED_ATTENTION = 4 + CUDNN_FLASH_ATTENTION = 8 + MATH = 16 + TRT_FLASH_ATTENTION = 32 + TRT_CROSS_ATTENTION = 64 + TRT_CAUSAL_ATTENTION = 128 + + class MultiHeadAttentionConfig: def __init__( self, @@ -62,6 +90,7 @@ def __init__( use_kv_cache: bool = False, share_past_present_buffer: bool = False, input_format: int = InputFormats.Q_K_V_BSNH_BSNH_BSNH, + verbose: bool = False, ): self.operator = "MultiHeadAttention" self.batch_size = batch_size @@ -100,6 +129,7 @@ def __init__( self.input_format = input_format self.is_packed_qkv = input_format == InputFormats.QKV_BSN3H self.is_packed_kv = input_format == InputFormats.Q_KV_BSNH_BSN2H + self.verbose = verbose def __repr__(self): return ( @@ -114,89 +144,93 @@ def __repr__(self): ) def shape_dict(self, input_format=None): + shapes: Dict[str, Tuple] = { + "output": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), + } + input_format = input_format or self.input_format - if input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH: - # cross attention does not have past state - return { + if input_format == InputFormats.QKV_BSN3H: + shapes = { + **shapes, + "query": (self.batch_size, self.sequence_length, self.num_heads, 3, self.head_size), + } + elif input_format == InputFormats.Q_KV_BSNH_BSN2H: + shapes = { + **shapes, + "query": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), + "key": (self.batch_size, self.sequence_length, self.num_heads, 2, self.head_size), + } + elif input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH: + shapes = { + **shapes, + "query": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), + "key": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), + "value": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), + } + else: + assert input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH + shapes = { + **shapes, "query": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), "key": (self.batch_size, self.num_heads, self.sequence_length, self.head_size), "value": (self.batch_size, self.num_heads, self.sequence_length, self.head_size), - "output": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), } if self.use_kv_cache: + assert input_format != InputFormats.Q_K_V_BSNH_BNSH_BNSH, "cross attention shall not have past state" shapes = { + **shapes, "past_key": (self.batch_size, 
self.num_heads, self.past_buffer_length, self.head_size), "past_value": (self.batch_size, self.num_heads, self.past_buffer_length, self.head_size), - "output": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), "present_key": (self.batch_size, self.num_heads, self.present_buffer_length, self.head_size), "present_value": (self.batch_size, self.num_heads, self.present_buffer_length, self.head_size), } - else: - shapes = { - "output": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), - } - if input_format == InputFormats.QKV_BSN3H: - shapes.update({"query": (self.batch_size, self.sequence_length, self.num_heads, 3, self.head_size)}) - elif input_format == InputFormats.Q_KV_BSNH_BSN2H: - shapes.update( - { - "query": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), - "key": (self.batch_size, self.sequence_length, self.num_heads, 2, self.head_size), - } - ) - else: # input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH - shapes.update( - { - "query": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), - "key": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), - "value": (self.batch_size, self.sequence_length, self.num_heads * self.head_size), - } - ) return shapes def symbolic_shape_dict(self, input_format=None): + shapes: Dict[str, Tuple] = { + "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), + } + input_format = input_format or self.input_format - if input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH: - # cross attention does not have past state - return { + if input_format == InputFormats.QKV_BSN3H: + shapes = { + **shapes, + "query": ("batch_size", "sequence_length", self.num_heads, 3, self.head_size), + } + elif input_format == InputFormats.Q_KV_BSNH_BSN2H: + shapes = { + **shapes, + "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "key": ("batch_size", "sequence_length", self.num_heads, 2, self.head_size), + } + elif input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH: + shapes = { + **shapes, + "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "key": ("batch_size", "sequence_length", self.num_heads * self.head_size), + "value": ("batch_size", "sequence_length", self.num_heads * self.head_size), + } + else: + assert input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH + shapes = { + **shapes, "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), "key": ("batch_size", self.num_heads, "sequence_length", self.head_size), "value": ("batch_size", self.num_heads, "sequence_length", self.head_size), - "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), } if self.use_kv_cache: + assert input_format != InputFormats.Q_K_V_BSNH_BNSH_BNSH, "cross attention shall not have past state" shapes = { + **shapes, "past_key": ("batch_size", self.num_heads, "past_buffer_length", self.head_size), "past_value": ("batch_size", self.num_heads, "past_buffer_length", self.head_size), - "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), "present_key": ("batch_size", self.num_heads, "present_buffer_length", self.head_size), "present_value": ("batch_size", self.num_heads, "present_buffer_length", self.head_size), } - else: - shapes = { - "output": ("batch_size", "sequence_length", self.num_heads * self.head_size), - } - if input_format == InputFormats.QKV_BSN3H: - shapes.update({"query": ("batch_size", "sequence_length", self.num_heads, 3, 
self.head_size)}) - elif input_format == InputFormats.Q_KV_BSNH_BSN2H: - shapes.update( - { - "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), - "key": ("batch_size", "sequence_length", self.num_heads, 2, self.head_size), - } - ) - else: # input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH - shapes.update( - { - "query": ("batch_size", "sequence_length", self.num_heads * self.head_size), - "key": ("batch_size", "sequence_length", self.num_heads * self.head_size), - "value": ("batch_size", "sequence_length", self.num_heads * self.head_size), - } - ) return shapes def random_inputs(self, seed: int = 123): @@ -215,44 +249,42 @@ def random_inputs(self, seed: int = 123): k_bnsh = k.transpose(1, 2) v_bnsh = v.transpose(1, 2) - if self.input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH: - return { + if self.input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH: + feeds = { "query": q.reshape(shape_dict["query"]), - "key": k_bnsh.contiguous(), - "value": v_bnsh.contiguous(), + "key": k.reshape(shape_dict["key"]), + "value": v.reshape(shape_dict["value"]), } - - feeds = {} - if self.use_kv_cache: - feeds.update( - { - "past_key": torch.empty(shape_dict["past_key"], device=device, dtype=dtype).normal_( - mean=0, std=0.1 - ), - "past_value": torch.empty(shape_dict["past_value"], device=device, dtype=dtype).normal_( - mean=0, std=0.1 - ), - } - ) - - if self.input_format == InputFormats.Q_K_V_BSNH_BSNH_BSNH: - feeds.update( - { - "query": q.reshape(shape_dict["query"]), - "key": k.reshape(shape_dict["key"]), - "value": v.reshape(shape_dict["value"]), - } - ) elif self.input_format == InputFormats.QKV_BSN3H: query = q.view(self.batch_size * self.sequence_length, self.num_heads, self.head_size) key = k.view(self.batch_size * self.sequence_length, self.num_heads, self.head_size) value = v.view(self.batch_size * self.sequence_length, self.num_heads, self.head_size) - feeds["query"] = torch.dstack((query, key, value)).reshape(shape_dict["query"]).contiguous() + feeds = { + "query": torch.dstack((query, key, value)).reshape(shape_dict["query"]).contiguous(), + } elif self.input_format == InputFormats.Q_KV_BSNH_BSN2H: key = k.view(self.batch_size * self.sequence_length, self.num_heads, self.head_size) value = v.view(self.batch_size * self.sequence_length, self.num_heads, self.head_size) - feeds["query"] = q.reshape(shape_dict["query"]) - feeds["key"] = torch.dstack((key, value)).reshape(shape_dict["key"]).contiguous() + feeds = { + "query": q.reshape(shape_dict["query"]), + "key": torch.dstack((key, value)).reshape(shape_dict["key"]).contiguous(), + } + else: + assert self.input_format == InputFormats.Q_K_V_BSNH_BNSH_BNSH + feeds = { + "query": q.reshape(shape_dict["query"]), + "key": k_bnsh.contiguous(), + "value": v_bnsh.contiguous(), + } + + if self.use_kv_cache: + feeds = { + **feeds, + "past_key": torch.empty(shape_dict["past_key"], device=device, dtype=dtype).normal_(mean=0, std=0.1), + "past_value": torch.empty(shape_dict["past_value"], device=device, dtype=dtype).normal_( + mean=0, std=0.1 + ), + } return feeds @@ -318,19 +350,32 @@ def create_multi_head_attention_onnx_model(config: MultiHeadAttentionConfig, use return model.SerializeToString() -def create_session( +def create_ort_session( config: MultiHeadAttentionConfig, + session_options=None, + attention_kernel=SdpaKernel.DEFAULT, + use_symbolic_shape: bool = True, ) -> CudaSession: - onnx_model_str = create_multi_head_attention_onnx_model(config) + if config.verbose: + print(f"create session for {vars(config)}") + 
onnx_model_str = create_multi_head_attention_onnx_model(config, use_symbolic_shape=use_symbolic_shape) if config.provider == "CUDAExecutionProvider": device_id = torch.cuda.current_device() if isinstance(config.device, str) else config.device.index provider_options = CudaSession.get_cuda_provider_options(device_id, config.enable_cuda_graph) + provider_options["sdpa_kernel"] = int(attention_kernel) providers = [(config.provider, provider_options), "CPUExecutionProvider"] else: providers = ["CPUExecutionProvider"] - ort_session = InferenceSession(onnx_model_str, providers=providers) + ort_session = InferenceSession(onnx_model_str, session_options, providers=providers) + return ort_session + + +def create_session( + config: MultiHeadAttentionConfig, session_options=None, attention_kernel=SdpaKernel.DEFAULT +) -> CudaSession: + ort_session = create_ort_session(config, session_options, attention_kernel, use_symbolic_shape=False) cuda_session = CudaSession(ort_session, config.device, config.enable_cuda_graph) shape_dict = config.shape_dict() cuda_session.allocate_buffers(shape_dict) @@ -340,11 +385,8 @@ def create_session( class OrtMultiHeadAttention: """A wrapper of ORT MultiHeadAttention to test relevance and performance.""" - def __init__( - self, - config: MultiHeadAttentionConfig, - ): - self.ort_session = create_session(config) + def __init__(self, config: MultiHeadAttentionConfig, session_options=None): + self.ort_session = create_session(config, session_options) self.feed_dict = config.random_inputs() def infer(self): @@ -363,53 +405,90 @@ def flops(batch, sequence_length, head_size, num_heads, causal): def tflops_per_second(flop, time): - return (flop / time / 10**12) if not math.isnan(time) else 0.0 - - -def get_gpu_kernel_name(config: MultiHeadAttentionConfig) -> str: - # This classification is for Nvidia GPU of Compute Capability 8.* like A100. - # Note that some kernel might not exist in older or newer GPUs. - if os.getenv("ORT_DISABLE_FLASH_ATTENTION") != "1": - if config.input_format == InputFormats.QKV_BSN3H: - min_seq_len = os.getenv("ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV") - min_length = int(min_seq_len) if min_seq_len is not None else 513 - if config.sequence_length >= min_length: - return "Flash" - else: - return "Flash" + try: + return (flop / time / 10**12) if not math.isnan(time) else 0.0 + except ZeroDivisionError: + return None + + +def get_gpu_kernel_name(attention_kernel: SdpaKernel) -> str: + kernel_names = { + SdpaKernel.DEFAULT: "ort:default", + SdpaKernel.FLASH_ATTENTION: "ort:flash", + SdpaKernel.EFFICIENT_ATTENTION: "ort:efficient", + SdpaKernel.CUDNN_FLASH_ATTENTION: "ort:cudnn", + SdpaKernel.MATH: "ort:math", + } + assert attention_kernel in kernel_names + return kernel_names[attention_kernel] - if (os.getenv("ORT_DISABLE_FUSED_CROSS_ATTENTION") != "1" and config.kv_sequence_length <= 128) or ( - os.getenv("ORT_DISABLE_FUSED_ATTENTION") != "1" - and (config.sequence_length <= 384 or os.getenv("ORT_DISABLE_TRT_FLASH_ATTENTION") != "1") - ): - return "TRT" - if os.getenv("ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION") != "1": - return "MemEff" +def get_cpu_kernel_name(config: MultiHeadAttentionConfig) -> str: + # CPU Flash Attention does not support causal and kv cache etc. 
+ if not (config.causal or config.use_kv_cache or config.past_sequence_length > 0): + if os.getenv("ORT_DISABLE_FLASH_ATTENTION") != "1": + return "ort:flash" - return "Unfused" + return "ort:math" -def get_cpu_kernel_name() -> str: - if os.getenv("ORT_DISABLE_FLASH_ATTENTION") != "1": - return "CPU:Flash" - return "CPU:Unfused" +# ------------------------------------------------------------------ +# Functions for benchmarking PyTorch SDPA +# ------------------------------------------------------------------ +def benchmark_torch_function(func: Callable, *args, **kwargs) -> float: + warmup = 5 + repeats = 100 + for _ in range(warmup): + func(*args, **kwargs) + timer = benchmark.Timer( + stmt="func(*args, **kwargs)", + globals={"args": args, "kwargs": kwargs, "func": func}, + ) + + return timer.timeit(number=repeats).median -def run_tflops_test(use_gpu: bool = True, enable_cuda_graph: bool = False, repeats: int = 100): - if use_gpu: - device_id = torch.cuda.current_device() - device = torch.device("cuda", device_id) - formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH, InputFormats.Q_KV_BSNH_BSN2H, InputFormats.QKV_BSN3H] - provider = "CUDAExecutionProvider" - print(f"enable_cuda_graph={enable_cuda_graph}") - else: - device_id = 0 - device = torch.device("cpu") - formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH] - enable_cuda_graph = False - provider = "CPUExecutionProvider" +def run_torch_sdpa( + batch_size: int, + q_seq_len: int, + kv_seq_len: int, + num_heads: int, + head_size: int, + causal: bool, + device, + dtype, + has_mask: bool = False, + mask_dim: int = 2, + mask_dtype=torch.bool, + backend: Optional[int] = None, +): + q_shape = (batch_size, num_heads, q_seq_len, head_size) + kv_shape = (batch_size, num_heads, kv_seq_len, head_size) + q = torch.randn(q_shape, device=device, dtype=dtype) + k = torch.randn(kv_shape, device=device, dtype=dtype) + v = torch.randn(kv_shape, device=device, dtype=dtype) + + attn_mask = None + if has_mask: + mask_shape = (batch_size, num_heads, q_seq_len, kv_seq_len) if mask_dim == 4 else (q_seq_len, kv_seq_len) + attn_mask = torch.ones(mask_shape, dtype=mask_dtype, device=device) + + context = sdpa_kernel(backend) if backend is not None else nullcontext() + + with context: + average_latency = benchmark_torch_function( + scaled_dot_product_attention, + q, + k, + v, + is_causal=causal, + attn_mask=attn_mask, + ) + return average_latency + + +def get_test_configs(use_gpu: bool = True): if use_gpu: # (batch_size, sequence_length, past_sequence_length, num_heads, head_size, run_unfused) configs = [ @@ -450,31 +529,70 @@ def run_tflops_test(use_gpu: bool = True, enable_cuda_graph: bool = False, repea ] else: configs = [ + # TNLGv4 (1, 128, 0, 32, 128, True), (1, 256, 0, 32, 128, True), (1, 512, 0, 32, 128, True), (1, 1024, 0, 32, 128, True), - (1, 2048, 0, 32, 128, True), + # (1, 2048, 0, 32, 128, True), + # bert-base + (1, 128, 0, 12, 64, True), + (1, 384, 0, 12, 64, True), + (1, 512, 0, 12, 64, True), + (4, 128, 0, 12, 64, True), + (4, 384, 0, 12, 64, True), + (4, 512, 0, 12, 64, True), + # bert-large + (1, 128, 0, 16, 64, True), + (1, 384, 0, 16, 64, True), + (1, 512, 0, 16, 64, True), + (4, 128, 0, 16, 64, True), + (4, 384, 0, 16, 64, True), + (4, 512, 0, 16, 64, True), ] + return configs + + +def get_compute_capability(): + assert torch.cuda.is_available() + major, minor = torch.cuda.get_device_capability() + sm = major * 10 + minor + return sm - # List of environment variables to enable/disable attention kernels - print("Environment Variables:") - env_names = [ - 
"ORT_DISABLE_FLASH_ATTENTION", - "ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV", - "ORT_DISABLE_FUSED_ATTENTION", - "ORT_DISABLE_TRT_FLASH_ATTENTION", - "ORT_ENABLE_FUSED_CAUSAL_ATTENTION", - "ORT_DISABLE_FUSED_CROSS_ATTENTION", - "ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION", - ] - for name in env_names: - value = os.getenv(name) - if value is not None: - print(f"{name}={value}") - print("\nformat\tcausal\tbatch\tseqlen\theads\th_dim\tms\tTFLOPS\tkernel") - causal = False +def run_tflops_test( + csv_writer: csv.DictWriter, + use_gpu: bool = True, + enable_cuda_graph: bool = False, + causal: bool = False, + has_past: bool = False, + intra_op_num_threads: int = 0, + repeats: int = 100, +): + print(f"run_tflops_test: causal={causal}") + + if use_gpu: + device_id = torch.cuda.current_device() + device = torch.device("cuda", device_id) + formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH, InputFormats.Q_KV_BSNH_BSN2H, InputFormats.QKV_BSN3H] + provider = "CUDAExecutionProvider" + # flash attention is available for sm >= 80 + sm = get_compute_capability() + if sm >= 80: + backends = [SdpaKernel.DEFAULT, SdpaKernel.FLASH_ATTENTION, SdpaKernel.EFFICIENT_ATTENTION] + else: + backends = [SdpaKernel.DEFAULT, SdpaKernel.EFFICIENT_ATTENTION] + else: + device_id = 0 + device = torch.device("cpu") + formats = [InputFormats.Q_K_V_BSNH_BSNH_BSNH] + enable_cuda_graph = False + provider = "CPUExecutionProvider" + backends = [SdpaKernel.DEFAULT] + + configs = get_test_configs(use_gpu) + + print("\nformat\tcausal\tprompt\tbatch\tseqlen\theads\th_dim\tthreads\tms\tTFLOPS\tkernel") for input_format in formats: for batch_size, sequence_length, past_sequence_length, num_heads, head_size, enable_unfused in configs: @@ -496,21 +614,27 @@ def run_tflops_test(use_gpu: bool = True, enable_cuda_graph: bool = False, repea share_past_present_buffer=False, input_format=input_format, ) - - session = create_session(config) + for attention_kernel in backends: + sess_options = SessionOptions() + sess_options.intra_op_num_threads = intra_op_num_threads + session = create_session(config, sess_options, attention_kernel=attention_kernel) if use_gpu: - kernel = get_gpu_kernel_name(config) + kernel = get_gpu_kernel_name(attention_kernel) else: - kernel = get_cpu_kernel_name() + kernel = get_cpu_kernel_name(config) - if kernel == "Unfused": + if "math" in kernel: # Skip large sequence length for Unfused kernel to avoid OOM. if not enable_unfused: + if config.verbose: + print(f"skip unfused kernel for {vars(config)}") continue # Unfused kernel does not support packed QKV or packed KV formats. 
if input_format not in [InputFormats.Q_K_V_BSNH_BSNH_BSNH]: + if config.verbose: + print(f"skip input_format for {vars(config)}") continue input_dict = config.random_inputs() @@ -526,19 +650,168 @@ def run_tflops_test(use_gpu: bool = True, enable_cuda_graph: bool = False, repea del session + format_str = InputFormats.input_format_str(input_format) + # compute TFLOPS per second - speed = tflops_per_second( - flops(batch_size, sequence_length, head_size, num_heads, causal), average_latency - ) + speed = None + if past_sequence_length == 0: + speed = tflops_per_second( + flops(batch_size, sequence_length, head_size, num_heads, causal), average_latency + ) + + row = { + "use_gpu": use_gpu, + "enable_cuda_graph": enable_cuda_graph, + "format": format_str, + "causal": causal, + "batch_size": batch_size, + "sequence_length": sequence_length, + "past_sequence_length": past_sequence_length, + "num_heads": num_heads, + "head_size": head_size, + "intra_op_num_threads": intra_op_num_threads, + "average_latency": average_latency, + "tflops": speed, + "kernel": kernel, + } + csv_writer.writerow(row) - format = InputFormats.input_format_str(input_format) + speed = f"{speed:.2f}" if speed is not None else "NA" print( - f"{format}\t{causal}\t{batch_size}\t{sequence_length}\t{num_heads}\t{head_size}\t{average_latency * 1000:.2f}\t{speed:.2f}\t{kernel}" + f"{format_str}\t{causal}\t{not has_past}\t{batch_size}\t{sequence_length}\t{num_heads}\t{head_size}\t" + f"{intra_op_num_threads}\t{average_latency * 1000:.2f}\t{speed}\t{kernel}" ) +def run_torch_test( + csv_writer: csv.DictWriter, + use_gpu: bool = True, + causal: bool = False, +): + configs = get_test_configs(use_gpu) + + if use_gpu: + if not torch.cuda.is_available(): + return + device_id = torch.cuda.current_device() + device = torch.device("cuda", device_id) + dtype = torch.float16 + backends = [ + None, + SDPBackend.FLASH_ATTENTION, + SDPBackend.EFFICIENT_ATTENTION, + SDPBackend.CUDNN_ATTENTION, + SDPBackend.MATH, + ] + else: + device = torch.device("cpu") + dtype = torch.float32 + backends = [None] + + backend_names = { + SDPBackend.FLASH_ATTENTION: "torch:flash", + SDPBackend.EFFICIENT_ATTENTION: "torch:efficient", + SDPBackend.CUDNN_ATTENTION: "torch:cudnn", + SDPBackend.MATH: "torch:math", + None: "torch:default", + } + + # Test PyTorch latency + for batch_size, sequence_length, past_sequence_length, num_heads, head_size, enable_unfused in configs: + for backend in backends: + if backend == SDPBackend.MATH and not enable_unfused: + continue + if backend == SDPBackend.FLASH_ATTENTION and platform.system() != "Linux": + continue + + backend_name = backend_names[backend] + try: + with torch.no_grad(): + torch_latency = run_torch_sdpa( + batch_size, + sequence_length, + sequence_length, + num_heads, + head_size, + causal, + has_mask=False, + mask_dim=2, + mask_dtype=torch.bool, + device=device, + dtype=dtype, + backend=backend, + ) + except RuntimeError: + continue + + speed = tflops_per_second(flops(batch_size, sequence_length, head_size, num_heads, causal), torch_latency) + input_format = "Q,K,V" + print( + f"{input_format}\t{causal}\t{batch_size}\t{sequence_length}\t{num_heads}\t{head_size}\t" + f"{0}\t{torch_latency * 1000:.2f}\t{speed:.2f}\t{backend_name}" + ) + row = { + "use_gpu": use_gpu, + "enable_cuda_graph": False, + "format": input_format, + "causal": causal, + "batch_size": batch_size, + "sequence_length": sequence_length, + "past_sequence_length": past_sequence_length, + "num_heads": num_heads, + "head_size": head_size, + 
"intra_op_num_threads": torch.get_num_threads(), + "average_latency": torch_latency, + "tflops": speed, + "kernel": backend_name, + } + csv_writer.writerow(row) + + +def run_tflops_tests(args): + features = "gpu" if args.use_gpu else "cpu" + if args.causal: + features += "_causal" + if args.has_past: + features += "_past" + csv_filename = "benchmark_mha_{}_{}_{}.csv".format( + features, + "torch" if args.torch else "ort", + datetime.now().strftime("%Y%m%d-%H%M%S"), + ) + with open(csv_filename, mode="a", newline="") as csv_file: + column_names = [ + "use_gpu", + "enable_cuda_graph", + "format", + "causal", + "batch_size", + "sequence_length", + "past_sequence_length", + "num_heads", + "head_size", + "intra_op_num_threads", + "average_latency", + "tflops", + "kernel", + ] + csv_writer = csv.DictWriter(csv_file, fieldnames=column_names) + csv_writer.writeheader() + + if args.torch: + run_torch_test(csv_writer, args.use_gpu, args.causal) + else: + run_tflops_test( + csv_writer, + use_gpu=args.use_gpu, + enable_cuda_graph=args.use_cuda_graph, + causal=args.causal, + has_past=args.has_past, + intra_op_num_threads=args.intra_op_num_threads, + ) + + def plot_prompt_performance( - sm: int, model_name: str, batch_size: int, num_heads: int, @@ -558,6 +831,7 @@ def plot_prompt_performance( "styles": [("red", "solid"), ("yellow", "dashdot"), ("blue", "dashed"), ("green", "dotted")][0 : len(formats)], } + sm = get_compute_capability() configs = [ triton.testing.Benchmark( x_names=["sequence_length"], @@ -591,13 +865,14 @@ def benchmark( sequence_length=sequence_length, num_heads=num_heads, head_size=head_size, - causal=True, + causal=False, past_sequence_length=0, kv_sequence_length=sequence_length if input_format == InputFormats.get_name_list()[-1] else None, max_cache_sequence_length=max_seq_len, provider="CUDAExecutionProvider", enable_cuda_graph=False, device=device, + dtype=torch.float16, use_kv_cache=False, input_format=InputFormats.convert(input_format), ) @@ -609,14 +884,14 @@ def benchmark( benchmark.run(save_path=".", print_data=True) -def run_performance_test(sm: int): +def run_bert_performance_test(): """ Run performance tests for prompt and token generation. """ configures = [ - (1, 32, 128, 8192, "TNLGv4"), - (4, 32, 128, 8192, "TNLGv4"), + # (1, 32, 128, 8192, "TNLGv4"), + # (4, 32, 128, 8192, "TNLGv4"), (1, 12, 64, 1024, "BertBase"), (16, 12, 64, 1024, "BertBase"), (1, 16, 64, 1024, "BertLarge"), @@ -625,7 +900,6 @@ def run_performance_test(sm: int): for batch_size, num_heads, head_size, max_seq_len, model_name in configures: plot_prompt_performance( - sm=sm, batch_size=batch_size, num_heads=num_heads, head_size=head_size, @@ -634,18 +908,84 @@ def run_performance_test(sm: int): ) +def _parse_arguments(): + parser = argparse.ArgumentParser(description="Benchmark MultiHeadAttention for ONNX Runtime and PyTorch.") + + parser.add_argument( + "--use_gpu", + required=False, + action="store_true", + help="Use GPU for inference.", + ) + parser.set_defaults(use_gpu=False) + + parser.add_argument( + "--use_cuda_graph", + required=False, + action="store_true", + help="Use cuda graph in onnxruntime.", + ) + parser.set_defaults(use_cuda_graph=False) + + parser.add_argument( + "--intra_op_num_threads", + required=False, + type=int, + choices=[0, 1, 2, 4, 8, 16], + default=0, + help="intra_op_num_threads for onnxruntime. 
", + ) + + parser.add_argument( + "--has_past", + required=False, + action="store_true", + help="whether past_sequence_length > 0", + ) + parser.set_defaults(has_past=False) + + parser.add_argument( + "--causal", + required=False, + action="store_true", + help="test unidirectional", + ) + parser.set_defaults(causal=False) + + parser.add_argument( + "--torch", + required=False, + action="store_true", + help="test pytorch instead of onnxruntime", + ) + parser.set_defaults(torch=False) + + args = parser.parse_args() + + return args + + if __name__ == "__main__": - if torch.cuda.is_available() and "CUDAExecutionProvider" in get_available_providers(): - # Test CUDA provider - major, minor = torch.cuda.get_device_capability() - sm = major * 10 + minor + args = _parse_arguments() + print(f"arguments:{args}") + + if args.has_past: + assert args.causal, "--has_past need --causal specified" + + if args.use_gpu: + assert args.torch or not args.causal, "no causal cuda kernel in MHA op" + assert torch.cuda.is_available() + if not args.torch: + assert "CUDAExecutionProvider" in get_available_providers() + if args.torch: + assert Version(torch.__version__) >= Version("2.3.0") + assert args.has_past is False + + if args.use_gpu and not args.torch: if platform.system() == "Linux": s = torch.cuda.Stream() with torch.cuda.stream(s), torch.no_grad(): - run_performance_test(sm) - - run_tflops_test(use_gpu=True, enable_cuda_graph=True) + run_bert_performance_test() - # Test CPU provider - run_tflops_test(use_gpu=False, enable_cuda_graph=False) + run_tflops_tests(args) diff --git a/onnxruntime/test/python/transformers/benchmark_mha.sh b/onnxruntime/test/python/transformers/benchmark_mha.sh index 7b21cf1cc1e08..613543d0172dd 100644 --- a/onnxruntime/test/python/transformers/benchmark_mha.sh +++ b/onnxruntime/test/python/transformers/benchmark_mha.sh @@ -1,14 +1,40 @@ -echo "flash attention v2" -ORT_DISABLE_FLASH_ATTENTION=0 ORT_MIN_SEQ_LEN_FLASH_ATTENTION_PACKED_QKV=0 python benchmark_mha.py | tee result.txt +#!/bin/sh -echo "===" -echo "TensorRT attention kernels - cross attention (when kv_seq_len <= 128) or fused attention (when seq_len <= 384) or flash attention (seq_len > 384)" -ORT_DISABLE_FLASH_ATTENTION=1 python benchmark_mha.py | tee -a result.txt +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+# -------------------------------------------------------------------------- -echo "===" -echo "Memory Efficient attention" -ORT_DISABLE_FLASH_ATTENTION=1 ORT_DISABLE_TRT_FLASH_ATTENTION=1 ORT_DISABLE_FUSED_ATTENTION=1 ORT_DISABLE_FUSED_CROSS_ATTENTION=1 python benchmark_mha.py | tee -a result.txt +echo "Benchmark Scaled Dot Product Attention (SDPA) performance on GPU:" -echo "===" -echo "Unfused Attention (some configurations might fail)" -ORT_DISABLE_FLASH_ATTENTION=1 ORT_DISABLE_TRT_FLASH_ATTENTION=1 ORT_DISABLE_FUSED_ATTENTION=1 ORT_DISABLE_FUSED_CROSS_ATTENTION=1 ORT_DISABLE_MEMORY_EFFICIENT_ATTENTION=1 python benchmark_mha.py | tee -a result.txt +export CUDA_VISIBLE_DEVICES=0 +python benchmark_mha.py --use_gpu +python benchmark_mha.py --use_gpu --use_cuda_graph +python benchmark_mha.py --use_gpu --torch + +cat benchmark_mha_gpu_*.csv > mha_gpu_benchmark_results.csv + +echo "Benchmark performance on CPU with number of threads:" +MKL_DYNAMIC=FALSE OMP_NUM_THREADS=1 python benchmark_mha.py --torch +MKL_DYNAMIC=FALSE OMP_NUM_THREADS=2 python benchmark_mha.py --torch +MKL_DYNAMIC=FALSE OMP_NUM_THREADS=4 python benchmark_mha.py --torch +MKL_DYNAMIC=FALSE OMP_NUM_THREADS=8 python benchmark_mha.py --torch + +python benchmark_mha.py --intra_op_num_threads 1 +python benchmark_mha.py --intra_op_num_threads 2 +python benchmark_mha.py --intra_op_num_threads 4 +python benchmark_mha.py --intra_op_num_threads 8 + + +echo "Benchmark performance on CPU with default threads settings:" +python benchmark_mha.py +ORT_DISABLE_FLASH_ATTENTION=1 python benchmark_mha.py +python benchmark_mha.py --torch + +python benchmark_mha.py --causal +python benchmark_mha.py --torch --causal + +# Pytorch SDPA does not support causal attention with past state, we only test ORT here. +python benchmark_mha.py --causal --has_past + +cat benchmark_mha_cpu_*.csv > mha_cpu_benchmark_results.csv diff --git a/onnxruntime/test/python/transformers/test_mha.py b/onnxruntime/test/python/transformers/test_mha.py index ff473cc2ced92..0fcbd889847e9 100644 --- a/onnxruntime/test/python/transformers/test_mha.py +++ b/onnxruntime/test/python/transformers/test_mha.py @@ -10,36 +10,15 @@ import concurrent.futures import itertools import unittest -from enum import IntEnum from typing import Dict, List, Optional import numpy import torch -from benchmark_mha import ( - InputFormats, - MultiHeadAttentionConfig, - OrtMultiHeadAttention, - create_multi_head_attention_onnx_model, -) +from benchmark_mha import InputFormats, MultiHeadAttentionConfig, OrtMultiHeadAttention, SdpaKernel, create_ort_session from einops import rearrange from parameterized import parameterized import onnxruntime -from onnxruntime import InferenceSession - - -class SdpaKernel(IntEnum): - """Bit flags for sdpa_kernel CUDA provider option""" - - DEFAULT = 0 - FLASH_ATTENTION = 1 - EFFICIENT_ATTENTION = 2 - TRT_FUSED_ATTENTION = 4 - CUDNN_FLASH_ATTENTION = 8 - MATH = 16 - TRT_FLASH_ATTENTION = 32 - TRT_CROSS_ATTENTION = 64 - TRT_CAUSAL_ATTENTION = 128 def attention_reference( @@ -466,7 +445,7 @@ def parity_check_mha_multi_threading( test_inputs: List[Dict], rtol: float = 1e-3, atol: float = 1e-3, - sdpa_kernel: int = SdpaKernel.DEFAULT, + attention_kernel: int = SdpaKernel.DEFAULT, max_threads: int = 5, verbose: bool = False, ): @@ -476,21 +455,14 @@ def parity_check_mha_multi_threading( if config.causal and config.provider == "CUDAExecutionProvider": return None # Some kernel does not support certain input format. 
- if sdpa_kernel not in [ + if attention_kernel not in [ SdpaKernel.DEFAULT, SdpaKernel.FLASH_ATTENTION, SdpaKernel.EFFICIENT_ATTENTION, ] and config.input_format in [InputFormats.Q_KV_BSNH_BSN2H]: return None - if verbose: - print(f"create a shared session with {vars(config)}") - onnx_model_str = create_multi_head_attention_onnx_model(config, use_symbolic_shape=True) - if config.provider == "CUDAExecutionProvider": - provider_options = {"arena_extend_strategy": "kSameAsRequested", "sdpa_kernel": int(sdpa_kernel)} - providers = [(config.provider, provider_options), "CPUExecutionProvider"] - else: - providers = ["CPUExecutionProvider"] - ort_session = InferenceSession(onnx_model_str, providers=providers) + + ort_session = create_ort_session(config, attention_kernel=attention_kernel, use_symbolic_shape=True) def convert_to_ort_inputs(feed_dict): ort_inputs = {} @@ -613,7 +585,7 @@ def test_mha_cuda(self, config): def test_mha_cpu(self, config): parity_check_mha(config) - def run_mha_cuda_multi_threading(self, spda_kernel): + def run_mha_cuda_multi_threading(self, attention_kernel): for configs in multi_thread_test_cases("CUDAExecutionProvider", comprehensive_mode): test_inputs = [] for config in configs: @@ -626,8 +598,10 @@ def run_mha_cuda_multi_threading(self, spda_kernel): config.input_format = old_format test_inputs.append({"config": config, "ort_inputs": ort_inputs, "ref_inputs": ref_inputs}) - exception = parity_check_mha_multi_threading(test_inputs, sdpa_kernel=spda_kernel, max_threads=len(configs)) - assert exception is None, f"{spda_kernel=}, {vars(configs[0])}, {exception}" + exception = parity_check_mha_multi_threading( + test_inputs, attention_kernel=attention_kernel, max_threads=len(configs) + ) + assert exception is None, f"{attention_kernel=}, {vars(configs[0])}, {exception}" def test_mha_cuda_multi_threading(self): self.run_mha_cuda_multi_threading(SdpaKernel.DEFAULT) From 5af423c7c0561d3861a6b8ed5598abef02715e28 Mon Sep 17 00:00:00 2001 From: Scott McKay Date: Sat, 27 Jul 2024 13:22:57 +1000 Subject: [PATCH 32/57] Set version and other info in the C# dll (#21517) ### Description Set version and other info in the Microsoft.ML.OnnxRuntime C# dll by setting GenerateAssemblyInfo to true and passing in ORT version in the CI. Minor re-org of the order of properties so related things are grouped a little better. ### Motivation and Context #21475 --- .../Microsoft.ML.OnnxRuntime.csproj | 67 +++++++++++-------- .../azure-pipelines/templates/c-api-cpu.yml | 4 +- 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj index 3c8a49bf93578..deb6b4f884bcf 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj @@ -23,7 +23,7 @@ + '$(IncludeMobileTargets)' == 'true'"> net8.0-android @@ -31,6 +31,43 @@ $(BaseTargets);$(MobileTargets) + + Microsoft.ML.OnnxRuntime + Microsoft.ML.OnnxRuntime + + + + 1.0.0 + 0.0.0 + + + + true + Microsoft.ML.OnnxRuntime C# Bindings + Microsoft + © Microsoft Corporation. All rights reserved. 
+ This package contains ONNX Runtime for .Net platforms + + + $(PackageVersion) + + + + + Microsoft + Microsoft.ML.OnnxRuntime.Managed + ONNX;ONNX Runtime;Machine Learning + https://github.com/Microsoft/onnxruntime + LICENSE.txt + ORT_icon_for_light_bg.png + + Release Def: + Branch: $(BUILD_SOURCEBRANCH) + Commit: $(BUILD_SOURCEVERSION) + Build: https://aiinfra.visualstudio.com/Lotus/_build/results?buildId=$(BUILD_BUILDID) + + + AnyCPU;x86 default @@ -43,8 +80,6 @@ $(OnnxRuntimeRoot)\csharp x64 - Microsoft.ML.OnnxRuntime - Microsoft.ML.OnnxRuntime false false portable @@ -54,27 +89,8 @@ on their device is not built for training, an exception will be thrown with the following message - "Training is disabled in the current build. Please build onnxruntime from source with the build flags enable_training_apis. "--> - true + true - - - Microsoft.ML.OnnxRuntime.Managed - Microsoft - 1.0.0 - 0.0.0 - $(PackageVersion) - This package contains ONNX Runtime for .Net platforms - ONNX;ONNX Runtime;Machine Learning - https://github.com/Microsoft/onnxruntime - © Microsoft Corporation. All rights reserved. - LICENSE.txt - ORT_icon_for_light_bg.png - - Release Def: - Branch: $(BUILD_SOURCEBRANCH) - Commit: $(BUILD_SOURCEVERSION) - Build: https://aiinfra.visualstudio.com/Lotus/_build/results?buildId=$(BUILD_BUILDID) - true @@ -82,7 +98,6 @@ false - false $(AllowedOutputExtensionsInPackageBuildOutputFolder);.pdb Debug;Release;RelWithDebInfo @@ -158,10 +173,6 @@ $(OrtConstants);__ENABLE_COREML__ - - $(OrtConstants);__XAMARIN__ - - $(DefineConstants);$(OrtConstants) diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 7ba1179e7ad4d..ec97da3786fd9 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -364,6 +364,8 @@ stages: workingDirectory: '$(Build.BinariesDirectory)/nuget-artifact' displayName: 'List artifacts' + - template: set-version-number-variables-step.yml + # Reconstruct the build dir - task: PowerShell@2 displayName: 'Extract native libraries for addition to nuget native package' @@ -403,7 +405,7 @@ stages: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo - msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' + msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) -p:PackageVersion=$(OnnxRuntimeVersion)' workingDirectory: '$(Build.SourcesDirectory)\csharp' - ${{ if eq(parameters.DoEsrp, true) }}: From 690d745cbff6f540f95e668be21da76873689a32 Mon Sep 17 00:00:00 2001 From: zz002 Date: Sat, 27 Jul 2024 11:28:55 +0800 Subject: [PATCH 33/57] [VitisAI] 1. KernelDef supports StartVersion and EndVersion (#21519) ### Description [VitisAI] 1. KernelDef supports StartVersion and EndVersion 2. 
CapabilityOps checks domain ### Motivation and Context Co-authored-by: Zhenze Wang --- onnxruntime/core/providers/vitisai/imp/capability.cc | 6 +++++- onnxruntime/core/providers/vitisai/imp/global_api.cc | 4 ++-- .../core/providers/vitisai/vitisai_execution_provider.cc | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/vitisai/imp/capability.cc b/onnxruntime/core/providers/vitisai/imp/capability.cc index 58522a45a151e..6d188076fe613 100644 --- a/onnxruntime/core/providers/vitisai/imp/capability.cc +++ b/onnxruntime/core/providers/vitisai/imp/capability.cc @@ -51,7 +51,11 @@ GetComputeCapabilityOps(const onnxruntime::GraphViewer& graph, std::vector node_indexs = graph.GetNodesInTopologicalOrder(); node_indexs.erase(std::remove_if(node_indexs.begin(), node_indexs.end(), [&](NodeIndex index) { return all_nodes_included_eps.count(index) > 0; }), node_indexs.end()); - node_indexs.erase(std::remove_if(node_indexs.begin(), node_indexs.end(), [&](NodeIndex index) { return all_support_optypes_by_eps.count(graph.GetNode(index)->OpType()) == 0; }), node_indexs.end()); + node_indexs.erase(std::remove_if(node_indexs.begin(), node_indexs.end(), + [&](NodeIndex index) { + auto node = graph.GetNode(index); + return all_support_optypes_by_eps.count(node->Domain() + ":" + node->OpType()) == 0; }), + node_indexs.end()); std::vector> result; for (auto& n : node_indexs) { diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index 8c1dce0d3dc1a..a86a4fb61d54d 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -173,7 +173,7 @@ void create_kernel_registry(std::vector domains) { auto def_builder = KernelDefBuilder::Create(); def_builder->SetName(op->GetName(op)); def_builder->SetDomain(domain->domain_.c_str()); - def_builder->SinceVersion(1); + def_builder->SinceVersion(op->GetStartVersion(op), op->GetEndVersion(op)); if (op->version > 12) { auto input_count = op->GetInputTypeCount(op); for (auto i = 0u; i < input_count; i++) { @@ -183,7 +183,7 @@ void create_kernel_registry(std::vector domains) { def_builder->Provider(onnxruntime::kVitisAIExecutionProvider); KernelCreateFn kernel_create_fn = [op](FuncManager&, const OpKernelInfo& info, std::unique_ptr& out) -> Status { - // out = std::make_unique(info, *op); + out = std::make_unique(info, *op); return Status::OK(); }; std::ignore = s_kernel_registry_vitisaiep->Register(KernelCreateInfo(def_builder->Build(), kernel_create_fn)); diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 0f0972d96bcee..58fef537535d2 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -44,7 +44,7 @@ VitisAIExecutionProvider::VitisAIExecutionProvider( void VitisAIExecutionProvider::CreateKernelRegistry() { for (const auto& domain : get_domains_vitisaiep()) { for (const auto* op : domain->custom_ops_) { - vitisai_optypes_.insert(op->GetName(op)); + vitisai_optypes_.insert(domain->domain_ + ":" + op->GetName(op)); } } } From d01fc75ef161a624c4275f89cb068cc1c79d9392 Mon Sep 17 00:00:00 2001 From: Yueqing Zhang Date: Fri, 26 Jul 2024 22:15:57 -0700 Subject: [PATCH 34/57] [VitisAI] support vaip create ep context nodes & bug fix (#21506) ### Description 1. 
We decided to move the context node creation back to our own repo because it is more flexible to modify. 2. We found a bug related the context node. It would change the inference order. So, we fixed in this PR as well. ### Motivation and Context This is crucial for Microsoft Release next month. --------- Co-authored-by: Yueqing Zhang --- .../shared_library/provider_interfaces.h | 1 + .../shared_library/provider_wrappedtypes.h | 1 + .../core/providers/vitisai/imp/global_api.cc | 50 +++++++++++++++++++ .../vitisai/include/vaip/custom_op.h | 11 ++++ .../vitisai/include/vaip/global_api.h | 6 ++- .../vitisai/include/vaip/vaip_ort_api.h | 11 ++-- .../vitisai/vitisai_execution_provider.cc | 14 ++++-- .../core/session/provider_bridge_ort.cc | 1 + 8 files changed, 88 insertions(+), 7 deletions(-) diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index 382b3ac932520..a9394838aa784 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -388,6 +388,7 @@ struct ProviderHost { virtual ONNX_NAMESPACE::TensorProto* AttributeProto__add_tensors(ONNX_NAMESPACE::AttributeProto* p) = 0; // GraphProto + virtual std::unique_ptr GraphProto__construct() = 0; virtual void GraphProto__operator_delete(ONNX_NAMESPACE::GraphProto* p) = 0; virtual void GraphProto__operator_assign(ONNX_NAMESPACE::GraphProto* p, const ONNX_NAMESPACE::GraphProto& v) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index de6c1da1d6430..242c7126f3274 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -146,6 +146,7 @@ struct AttributeProto final { }; struct GraphProto final { + static std::unique_ptr Create() { return g_host->GraphProto__construct(); } static void operator delete(void* p) { g_host->GraphProto__operator_delete(reinterpret_cast(p)); } void operator=(const GraphProto& v) { return g_host->GraphProto__operator_assign(this, v); } diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index a86a4fb61d54d..df47fa5cee4ab 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -55,10 +55,15 @@ struct OrtVitisAIEpAPI { uint32_t (*vaip_get_version)(); void (*get_backend_compilation_cache)(const std::string& model_path, const onnxruntime::Graph& graph, const char* json_config, uint8_t compiler_codes, std::string& cache_dir, std::string& cache_key, std::string& cache_data); void (*restore_backend_compilation_cache)(const std::string& cache_dir, const std::string& cache_key, const std::string& cache_data, const std::string& model_path); + void (*create_ep_context_nodes)( + onnxruntime::Graph& ep_context_graph, + const std::vector>& eps, + vaip_core::DllSafe>* ret_value) = nullptr; void Ensure() { if (handle_) return; auto& env = Provider_GetHost()->Env__Default(); + auto& logger = *Provider_GetHost()->LoggingManager_GetDefaultLogger(); #ifdef _WIN32 // this dll is already linked to the executable, normally a test program handle_ = reinterpret_cast(GetModuleHandle(TEXT("onnxruntime_vitisai_ep.dll"))); @@ -81,6 +86,10 @@ struct OrtVitisAIEpAPI { (void**)&vaip_get_version); 
ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "get_compilation_cache", (void**)&get_backend_compilation_cache)); ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "restore_compilation_cache", (void**)&restore_backend_compilation_cache)); + status1 = (env.GetSymbolFromLibrary(handle_, "create_ep_context_nodes", (void**)&create_ep_context_nodes)); + if (!status1.IsOK()) { + LOGS(logger, WARNING) << "create_ep_context_nodes is not defined, please upgrade onnxruntime_vitisai_ep.dll. However, it still works."; + } } private: @@ -146,6 +155,24 @@ void restore_backend_compilation_cache(const std::string& cache_dir, const std:: s_library_vitisaiep.restore_backend_compilation_cache(cache_dir, cache_key, cache_data, model_path); } +bool has_create_ep_context_nodes() { + return s_library_vitisaiep.create_ep_context_nodes != nullptr; +} + +std::optional> create_ep_context_nodes( + onnxruntime::Graph& ep_context_graph, + const std::vector>& eps) { + if (s_library_vitisaiep.create_ep_context_nodes) { + vaip_core::DllSafe> nodes; + s_library_vitisaiep.create_ep_context_nodes(ep_context_graph, eps, &nodes); + if (nodes.get()) { + auto ret = std::vector(*nodes); + return ret; + } + } + return std::nullopt; +} + struct MyCustomOpKernel : OpKernel { MyCustomOpKernel(const OpKernelInfo& info, const OrtCustomOp& op) : OpKernel(info), op_(op) { op_kernel_ = @@ -405,6 +432,29 @@ vaip_core::OrtApiForVaip* create_org_api_hook() { graph.AddInitializedTensor(tensor); }; + the_global_api.get_model_path = [](const Graph& graph) -> const std::filesystem::path& { + return graph.ModelPath(); + }; + + the_global_api.create_empty_model = [](const std::filesystem::path& path, const std::vector>& opset) -> Model* { + auto model_proto = ONNX_NAMESPACE::ModelProto::Create(); + auto graph_proto = ONNX_NAMESPACE::GraphProto::Create(); + model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + for (const auto& op : opset) { + auto* opset_import = model_proto->add_opset_import(); + *(opset_import->mutable_domain()) = op.first; + opset_import->set_version(op.second); + } + std::ignore = model_proto->mutable_graph(); // create a graph + auto& logger = logging::LoggingManager::DefaultLogger(); + auto model = Model::Create(std::move(*model_proto), path, nullptr, logger); + return model.release(); + }; + + the_global_api.graph_set_inputs = [](Graph& graph, gsl::span inputs) { + graph.SetInputs(inputs); + }; + if (!s_library_vitisaiep.vaip_get_version) { return reinterpret_cast(&(the_global_api.host_)); } else { diff --git a/onnxruntime/core/providers/vitisai/include/vaip/custom_op.h b/onnxruntime/core/providers/vitisai/include/vaip/custom_op.h index d34f7095b704d..5d020e00ff5b7 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/custom_op.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/custom_op.h @@ -26,6 +26,17 @@ class ExecutionProvider { virtual DllSafe> get_meta_def_constant_initializer() const = 0; virtual std::unique_ptr compile() const = 0; + + public: + inline void set_fused_node(const onnxruntime::Node* fused_node) { + fused_node_ = fused_node; + } + inline const onnxruntime::Node* get_fused_node() const { + return fused_node_; + } + + private: + const onnxruntime::Node* fused_node_ = nullptr; }; class CustomOp { diff --git a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h index 3fdbc60bb0ee6..ae2a513a98e32 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h +++ 
b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h @@ -9,10 +9,14 @@ #include "vaip/my_ort.h" #include "vaip/dll_safe.h" #include "vaip/custom_op.h" - +#include void initialize_vitisai_ep(); vaip_core::DllSafe>> compile_onnx_model(const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::logging::Logger& logger, const onnxruntime::ProviderOptions& options); std::shared_ptr get_kernel_registry_vitisaiep(); const std::vector& get_domains_vitisaiep(); void get_backend_compilation_cache(const onnxruntime::PathString& model_path_str, const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::ProviderOptions& options, uint8_t compiler_codes, std::string& cache_dir, std::string& cache_key, std::string& cache_data); void restore_backend_compilation_cache(const std::string& cache_dir, const std::string& cache_key, const std::string& cache_data, const std::string& model_path); +std::optional> create_ep_context_nodes( + onnxruntime::Graph& ep_context_graph, + const std::vector>& eps); +bool has_create_ep_context_nodes(); diff --git a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h index 3346739890484..e6aacfe1f0272 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h @@ -8,12 +8,13 @@ #include #include #include +#include struct OrtApi; namespace vaip_core { -#define VAIP_ORT_API_MAJOR (3u) -#define VAIP_ORT_API_MINOR (1u) +#define VAIP_ORT_API_MAJOR (4u) +#define VAIP_ORT_API_MINOR (0u) #define VAIP_ORT_API_PATCH (0u) struct OrtApiForVaip { uint32_t magic; // 'VAIP' or something else to make sure the following field @@ -222,7 +223,11 @@ struct OrtApiForVaip { const std::vector& data); // [88] TensorProto* (*tensor_proto_new_bf16)( const std::string& name, const std::vector& shape, - const std::vector& data); // [89] + const std::vector& data); // [89] + const std::filesystem::path& (*get_model_path)(const Graph& graph); // [90] + Model* (*create_empty_model)(const std::filesystem::path& path, const std::vector>& opset); //[91] + void (*graph_set_inputs)(Graph& graph, + gsl::span inputs); // [92] }; #ifndef USE_VITISAI diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 58fef537535d2..756bda2199e89 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -58,8 +58,15 @@ const InlinedVector VitisAIExecutionProvider::GetEpContextNodes() c // All preconditions are supposed to have happened. 
if (p_ep_ctx_model_) { auto& graph = p_ep_ctx_model_->MainGraph(); - for (const auto* p_node : graph.Nodes()) { - ep_context_node_ptrs.push_back(p_node); + if (has_create_ep_context_nodes()) { + auto nodes = create_ep_context_nodes(graph, **execution_providers_); + if (nodes.has_value()) { + ep_context_node_ptrs.assign(nodes->begin(), nodes->end()); + } + } else { + for (const auto* p_node : graph.Nodes()) { + ep_context_node_ptrs.push_back(p_node); + } } } return ep_context_node_ptrs; @@ -187,6 +194,7 @@ common::Status VitisAIExecutionProvider::Compile(const std::vectorexecution_providers_)[index]->set_fused_node(&fused_node_graph.fused_node.get()); compute_info.create_state_func = [this, index](ComputeContext* context, FunctionState* state) { auto* p = (**this->execution_providers_)[index]->compile().release(); *state = p; @@ -204,7 +212,7 @@ common::Status VitisAIExecutionProvider::Compile(const std::vectoradd_tensors(); } // GraphProto (wrapped) + std::unique_ptr GraphProto__construct() override { return std::make_unique(); } void GraphProto__operator_delete(ONNX_NAMESPACE::GraphProto* p) override { delete p; } const ONNX_NAMESPACE::ValueInfoProto& GraphProto__input(const ONNX_NAMESPACE::GraphProto* p, int index) override { return p->input(index); } From 10b4a3b90bd61fcda8aefecf2a1dce1a45c086e1 Mon Sep 17 00:00:00 2001 From: maggie1059 <34173352+maggie1059@users.noreply.github.com> Date: Fri, 26 Jul 2024 22:26:38 -0700 Subject: [PATCH 35/57] Fix conda failure for onnxruntime-directml (#21526) The change in #21005 works for directly building wheels with `build.py`, but ort-nightly-directml wheels, as well as the 1.18.1 release of the onnxruntime-directml python wheel, still do not work with conda since they're built from the `py-win-gpu.yml` pipeline, which uses `install_third_party_deps.ps1` to set compile flags. --- tools/ci_build/github/windows/install_third_party_deps.ps1 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/windows/install_third_party_deps.ps1 b/tools/ci_build/github/windows/install_third_party_deps.ps1 index 07679006fb343..168df90188791 100644 --- a/tools/ci_build/github/windows/install_third_party_deps.ps1 +++ b/tools/ci_build/github/windows/install_third_party_deps.ps1 @@ -27,7 +27,7 @@ $Env:CMAKE_PREFIX_PATH = "$install_prefix" New-Item -Path "$install_prefix" -ItemType Directory -Force # Setup compile flags -$compile_flags = @('/MP', '/guard:cf', '/DWIN32', '/D_WINDOWS', '/DWINVER=0x0A00', '/D_WIN32_WINNT=0x0A00', '/DNTDDI_VERSION=0x0A000000', '/W3') +$compile_flags = @('/MP', '/guard:cf', '/DWIN32', '/D_WINDOWS', '/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR', '/DWINVER=0x0A00', '/D_WIN32_WINNT=0x0A00', '/DNTDDI_VERSION=0x0A000000', '/W3') $linker_flags=@('/guard:cf') if ($use_cache) { From 1ce160883f964509a547458c484d2449bda047ae Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 26 Jul 2024 22:31:16 -0700 Subject: [PATCH 36/57] Bump Sixlabors.ImageSharp from 2.1.8 to 2.1.9 in /csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample (#21444) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bumps [Sixlabors.ImageSharp](https://github.com/SixLabors/ImageSharp) from 2.1.8 to 2.1.9.
Release notes

Sourced from Sixlabors.ImageSharp's releases.

v2.1.9

What's Changed

Full Changelog: https://github.com/SixLabors/ImageSharp/compare/v2.1.8...v2.1.9

Commits
  • 9816ca4 Merge pull request #2770 from SixLabors/af/backport-2759-2.1.x
  • b33d666 handle DecodingMode
  • 6b2030b Merge branch 'release/2.1.x' into af/backport-2759-2.1.x
  • 8ffad3f Issue2012BadMinCode should decode now
  • 1f5bf23 skip Issue2758_DecodeWorks
  • 3bf8c57 manual port of 3.1 gif decoder
  • 28c20de Clamp JPEG quality estimation results.
  • 4b910e7 Decode LZW row by row
  • a1f2879 Merge pull request #2756 from SixLabors/af/git-av-2.1
  • 898df7f backport #2749 to 2.1
  • Additional commits viewable in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=Sixlabors.ImageSharp&package-manager=nuget&previous-version=2.1.8&new-version=2.1.9)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.
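The underlying change is a one-line version bump of the ImageSharp PackageReference in the sample project's csproj; a minimal sketch of what that edit looks like (attribute layout assumed, not copied from the diff below):

```xml
<!-- Sketch of the change in Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj (attribute layout assumed) -->
<ItemGroup>
  <PackageReference Include="Sixlabors.ImageSharp" Version="2.1.9" />  <!-- was Version="2.1.8" -->
</ItemGroup>
```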
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .../Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj b/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj index 647c0bbe6a242..29fc9f3bc382f 100644 --- a/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj +++ b/csharp/sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample/Microsoft.ML.OnnxRuntime.ResNet50v2Sample.csproj @@ -8,7 +8,7 @@ - + From 48fb8a7e56a7263a8405dc644756eb5c55560352 Mon Sep 17 00:00:00 2001 From: jingyanwangms <47403504+jingyanwangms@users.noreply.github.com> Date: Sat, 27 Jul 2024 11:10:52 -0700 Subject: [PATCH 37/57] Security fuzz address sanitizer fix Bug #2 and #3 (#21528) ### Description Security fuzz test with address sanitizer found several bugs --- onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc | 2 ++ onnxruntime/core/optimizer/attention_fusion.cc | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc b/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc index 34a1da99316a2..030cdb1e1b17f 100644 --- a/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc +++ b/onnxruntime/contrib_ops/cpu/transformers/subgraph_gpt.cc @@ -143,6 +143,8 @@ Status GptSubgraph::Validate(const std::vector& subgraph_inputs, // Past state shape is like (2, batch_size, num_heads, past_seq_len, hidden_size/num_heads). const ONNX_NAMESPACE::TensorShapeProto* past_shape = subgraph_inputs[3]->Shape(); + ORT_RETURN_IF(past_shape == nullptr, + "subgraph past state cannot be nullptr"); ORT_RETURN_IF(past_shape->dim_size() != 5, "subgraph past state is expected to have 5 dimension, got ", past_shape->dim_size()); diff --git a/onnxruntime/core/optimizer/attention_fusion.cc b/onnxruntime/core/optimizer/attention_fusion.cc index 08066f030a381..64a38214caff0 100644 --- a/onnxruntime/core/optimizer/attention_fusion.cc +++ b/onnxruntime/core/optimizer/attention_fusion.cc @@ -210,7 +210,7 @@ Status AttentionFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, if ((node.GetOutputEdgesCount() >= 2 && node.GetOutputEdgesCount() <= 6) && // Add node.GetOutputEdgesCount() == 5/6 for distilbert graph_utils::IsSupportedOptypeVersionAndDomain(node, "LayerNormalization", {1, 17}, kOnnxDomain) && - graph_utils::IsSupportedProvider(node, GetCompatibleExecutionProviders())) { + graph_utils::IsSupportedProvider(node, GetCompatibleExecutionProviders()) && node.InputDefs().size() > 2) { // Get hidden size from layer norm bias tensor shape. const NodeArg& layer_norm_bias = *(node.InputDefs()[2]); if (!optimizer_utils::IsShapeKnownOnAllDims(layer_norm_bias, 1)) { From 82b2955268e14f26eb71ad2d660452ab8db454d7 Mon Sep 17 00:00:00 2001 From: Ranjit Ranjan <165394499+ranjitshs@users.noreply.github.com> Date: Sat, 27 Jul 2024 23:47:22 +0530 Subject: [PATCH 38/57] [AIX]test failure fix using gtest-1.15.0 for AIX (#21497) ### Description Local CI setup for AIX reported tests failure after the gtest 1.15.0 upgrade. ### Motivation and Context Below tests failure is observed after gtest upgrade. 
The following tests FAILED: 1 - onnxruntime_test_all (ILLEGAL) 7 - onnxruntime_logging_apis_test (Subprocess aborted) To fix this, I am enabling pthread support under gtest. This was disabled with previous version of gtest for some reason. Now by enabling this, above tests are getting passed with gtest 1.15.0. --- cmake/external/onnxruntime_external_deps.cmake | 3 --- 1 file changed, 3 deletions(-) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 14e6ed515fd6e..775576a771529 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -46,9 +46,6 @@ if (onnxruntime_BUILD_UNIT_TESTS) if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(gtest_disable_pthreads ON) endif() - if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") - set(gtest_disable_pthreads ON CACHE BOOL "gtest_disable_pthreads" FORCE) - endif() set(INSTALL_GTEST OFF CACHE BOOL "" FORCE) if (IOS OR ANDROID) # on mobile platforms the absl flags class dumps the flag names (assumably for binary size), which breaks passing From 7e23212de9746ed2452061958f8aae3ffc171cee Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Sat, 27 Jul 2024 15:58:12 -0700 Subject: [PATCH 39/57] Delete tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml (#21529) ### Description Delete tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml ### Motivation and Context This CI pipeline has been divided into 4 different pipeline. --- .../azure-pipelines/win-gpu-ci-pipeline.yml | 125 ------------------ 1 file changed, 125 deletions(-) delete mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml deleted file mode 100644 index c5262880c4c55..0000000000000 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ /dev/null @@ -1,125 +0,0 @@ -##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### -trigger: - branches: - include: - - main - - rel-* - paths: - exclude: - - docs/** - - README.md - - CONTRIBUTING.md - - BUILD.md - - 'js/web' - - 'onnxruntime/core/providers/js' -pr: - branches: - include: - - main - - rel-* - paths: - exclude: - - docs/** - - README.md - - CONTRIBUTING.md - - BUILD.md - - 'js/web' - - 'onnxruntime/core/providers/js' -#### end trigger #### - -parameters: -- name: CudaVersion - displayName: CUDA version - type: string - default: '12.2' - values: - - 11.8 - - 12.2 -- name: RunOnnxRuntimeTests - displayName: Run Tests? 
- type: boolean - default: true - -stages: -- stage: cuda - dependsOn: [] - jobs: - - template: templates/jobs/win-ci-vs-2022-job.yml - parameters: - BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda.bat - buildArch: x64 - additionalBuildFlags: >- - --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" - --enable_cuda_profiling --enable_transformers_tool_test - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 - --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON - --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON - msbuildPlatform: x64 - isX86: false - job_name_suffix: x64_RelWithDebInfo - RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - ORT_EP_NAME: CUDA - WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-A10 - -- stage: training - dependsOn: [] - jobs: - - template: templates/jobs/win-ci-vs-2022-job.yml - parameters: - BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda.bat - buildArch: x64 - additionalBuildFlags: >- - --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" - --skip_onnx_tests - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 - msbuildPlatform: x64 - isX86: false - job_name_suffix: x64_RelWithDebInfo - RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - ORT_EP_NAME: CUDA - WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-A10 - isTraining: true - -- stage: dml - dependsOn: [] - jobs: - - template: templates/jobs/win-ci-vs-2022-job.yml - parameters: - BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env.bat - buildArch: x64 - additionalBuildFlags: --enable_pybind --use_dml --enable_wcos --use_winml - msbuildPlatform: x64 - isX86: false - job_name_suffix: x64_RelWithDebInfo - RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - ORT_EP_NAME: DML - WITH_CACHE: false - MachinePool: onnxruntime-Win2022-GPU-dml-A10 - -- stage: kernelDocumentation - dependsOn: [] - jobs: - - template: templates/jobs/win-ci-vs-2022-job.yml - parameters: - BuildConfig: 'RelWithDebInfo' - EnvSetupScript: setup_env_cuda.bat - buildArch: x64 - # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags - additionalBuildFlags: >- - --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda - --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 - --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF - msbuildPlatform: x64 - isX86: false - job_name_suffix: x64_RelWithDebInfo - RunOnnxRuntimeTests: false - GenerateDocumentation: true - ORT_EP_NAME: CUDA # It doesn't really matter which EP is selected here since this stage is for documentation. - WITH_CACHE: true - MachinePool: onnxruntime-Win2022-GPU-A10 From a4d3a1ce0c18e1d1b31a9cc0b45beba290ee114c Mon Sep 17 00:00:00 2001 From: liqun Fu Date: Sat, 27 Jul 2024 15:58:36 -0700 Subject: [PATCH 40/57] pick changes from https://github.com/onnx/onnx/pull/6195 to fix heap-buffer-overflow in onnx::convPoolShapeInference (#21507) ### Description onnx 1.16.2 is not available before ort 1.19.0 code freeze. 
Thus pick the needed change as patch --- cmake/patches/onnx/onnx.patch | 383 ++++++++++++++++++ .../providers/cpu/generator/random_test.cc | 8 +- .../core/graph/training_op_defs.cc | 104 +++-- 3 files changed, 447 insertions(+), 48 deletions(-) diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index 162d33581a5ca..6ac3555eeecf1 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -86,3 +86,386 @@ index 0aab3e26..398ac2d6 100644 +#endif + #endif // ! ONNX_ONNX_PB_H +diff --git a/onnx/defs/math/defs.cc b/onnx/defs/math/defs.cc +index c315a2a7..58963154 100644 +--- a/onnx/defs/math/defs.cc ++++ b/onnx/defs/math/defs.cc +@@ -3472,6 +3472,9 @@ ONNX_OPERATOR_SET_SCHEMA( + } + + auto& input_shape = getInputShape(ctx, 0); ++ if (input_shape.dim_size() < 2) { ++ fail_shape_inference("First input should have at least 2 dimensions in ", ctx.getDisplayName(), "."); ++ } + auto signal_dim = input_shape.dim(1); + if (!signal_dim.has_dim_value()) { + return; +diff --git a/onnx/defs/nn/defs.cc b/onnx/defs/nn/defs.cc +index be6a851d..fad595d0 100644 +--- a/onnx/defs/nn/defs.cc ++++ b/onnx/defs/nn/defs.cc +@@ -126,6 +126,9 @@ void convPoolShapeInference( + residual -= stride; + } + } ++ if (i >= static_cast(effective_kernel_shape.size())) { ++ fail_shape_inference("kernel shape should have ", input_dims_size, " values in ", ctx.getDisplayName(), "."); ++ } + int64_t total_pad = residual == 0 ? effective_kernel_shape[i] - stride : effective_kernel_shape[i] - residual; + if (total_pad < 0) + total_pad = 0; +@@ -959,19 +962,21 @@ ONNX_OPERATOR_SET_SCHEMA( + auto w_type = ctx.getInputType(3); + if (nullptr == x_type || nullptr == w_type || x_type->value_case() != TypeProto::kTensorType || + w_type->value_case() != TypeProto::kTensorType) { +- fail_type_inference("inputs are expected to have tensor type."); ++ fail_type_inference("inputs are expected to have tensor type in ", ctx.getDisplayName(), "."); + } + + auto x_zero_point_type = ctx.getInputType(2); + if (nullptr == x_zero_point_type || + x_zero_point_type->tensor_type().elem_type() != x_type->tensor_type().elem_type()) { +- fail_type_inference("input and zero_point pair is expected to have be same type."); ++ fail_type_inference( ++ "input and zero_point pair is expected to have be same type in ", ctx.getDisplayName(), "."); + } + + auto w_zero_point_type = ctx.getInputType(5); + if (nullptr == w_zero_point_type || + w_zero_point_type->tensor_type().elem_type() != w_type->tensor_type().elem_type()) { +- fail_type_inference("weight and zero_point pair is expected to have same type."); ++ fail_type_inference( ++ "weight and zero_point pair is expected to have same type in ", ctx.getDisplayName(), "."); + } + + propagateElemTypeFromInputToOutput(ctx, 7, 0); +@@ -2647,7 +2652,8 @@ ONNX_OPERATOR_SET_SCHEMA( + if (!hasNInputShapes(ctx, 1)) { + return; + } +- auto& input_shape = ctx.getInputType(0)->tensor_type().shape(); ++ ++ auto& input_shape = getInputShape(ctx, 0); + int64_t input_ndim = input_shape.dim_size(); + int64_t axis = -1; + auto axis_proto = ctx.getAttribute("axis"); +@@ -2659,7 +2665,16 @@ ONNX_OPERATOR_SET_SCHEMA( + // positive value. 
+ axis += input_ndim; + } +- ++ if (axis < 0) { ++ fail_shape_inference( ++ "Unexpected axis value (", ++ axis, ++ ") rank of first input is ", ++ input_ndim, ++ " in ", ++ ctx.getDisplayName(), ++ "."); ++ } + if (ctx.getNumOutputs() > 1) { + auto mean_shape = ctx.getOutputType(1)->mutable_tensor_type()->mutable_shape(); + mean_shape->CopyFrom(input_shape); +diff --git a/onnx/defs/nn/old.cc b/onnx/defs/nn/old.cc +index 57f8e2a4..8b2dc07f 100644 +--- a/onnx/defs/nn/old.cc ++++ b/onnx/defs/nn/old.cc +@@ -201,6 +201,9 @@ void convPoolShapeInference_opset19( + residual -= stride; + } + } ++ if (i >= static_cast(effective_kernel_shape.size())) { ++ fail_shape_inference("kernel shape should have ", input_dims_size, " values in ", ctx.getDisplayName(), "."); ++ } + int64_t total_pad = residual == 0 ? effective_kernel_shape[i] - stride : effective_kernel_shape[i] - residual; + if (total_pad < 0) + total_pad = 0; +diff --git a/onnx/defs/shape_inference.h b/onnx/defs/shape_inference.h +index a80473b3..d1bcd401 100644 +--- a/onnx/defs/shape_inference.h ++++ b/onnx/defs/shape_inference.h +@@ -105,6 +105,10 @@ struct InferenceContext { + virtual const SparseTensorProto* getInputSparseData(size_t index) const = 0; + // Gets the shape inputs computed by partial data propagation. + virtual const TensorShapeProto* getSymbolicInput(size_t index) const = 0; ++ // To display a name the user can use to narrow its search. ++ virtual std::string getDisplayName() const { ++ return ""; ++ } + }; + + // We use data propagation to perform partial evaluation of the model, to compute statically +@@ -263,7 +267,15 @@ inline void propagateElemTypeFromDtypeToOutput( + } else { + // This is not expected to happen + fail_type_inference( +- "Output ", outputIndex, " expected to have: ", expected_value_case, " or UNDEFINED. Got: ", output_value_case); ++ "Output ", ++ outputIndex, ++ " expected to have: ", ++ expected_value_case, ++ " or UNDEFINED. 
Got: ", ++ output_value_case, ++ " in ", ++ ctx.getDisplayName(), ++ "."); + } + } + +@@ -277,18 +289,18 @@ inline void propagateElemTypeFromDtypeToOutput(InferenceContext& ctx, const Attr + const auto attr_type = attr->type(); + if (attr_type == AttributeProto::TENSOR) { + if (attr->t().dims().size() != 1) { +- fail_type_inference("Attribute expected to have a one-dim tensor"); ++ fail_type_inference("Attribute expected to have a one-dim tensor in ", ctx.getDisplayName(), "."); + } + data_type = attr->t().data_type(); + expected_value_case = TypeProto::kTensorType; + } else if (attr_type == AttributeProto::SPARSE_TENSOR) { + if (attr->sparse_tensor().dims().size() != 1) { +- fail_type_inference("Attribute expected to have a one-dim sparse tensor"); ++ fail_type_inference("Attribute expected to have a one-dim sparse tensor in ", ctx.getDisplayName(), "."); + } + data_type = attr->sparse_tensor().values().data_type(); + expected_value_case = TypeProto::kSparseTensorType; + } else { +- fail_type_inference("Attribute expected to have tensor or sparse tensor type"); ++ fail_type_inference("Attribute expected to have tensor or sparse tensor type in ", ctx.getDisplayName(), "."); + } + + propagateElemTypeFromDtypeToOutput(ctx, data_type, outputIndex, expected_value_case); +@@ -326,7 +338,10 @@ inline const TensorShapeProto& getInputShape(const InferenceContext& ctx, size_t + const auto* input_type = ctx.getInputType(n); + const auto value_case = input_type->value_case(); + if (value_case != TypeProto::kTensorType && value_case != TypeProto::kSparseTensorType) { +- fail_type_inference("Attribute expected to have tensor or sparse tensor type"); ++ fail_type_inference("Input ", n, "expected to be a tensor or a sparse tensor type in ", ctx.getDisplayName(), "."); ++ } ++ if (!hasShape(*input_type)) { ++ fail_shape_inference("Input ", n, " must have a non null shape in ", ctx.getDisplayName(), "."); + } + if (value_case == TypeProto::kTensorType) { + return input_type->tensor_type().shape(); +@@ -344,7 +359,7 @@ inline const TensorShapeProto* getOptionalInputShape(InferenceContext& ctx, size + + const auto value_case = input_type->value_case(); + if (value_case != TypeProto::kTensorType && value_case != TypeProto::kSparseTensorType) { +- fail_type_inference("Attribute expected to have tensor or sparse tensor type"); ++ fail_type_inference("Input ", n, "expected to be a tensor or a sparse tensor type in ", ctx.getDisplayName(), "."); + } + if (value_case == TypeProto::kTensorType) { + return &input_type->tensor_type().shape(); +@@ -372,7 +387,10 @@ inline void appendSingleDimCopiedFromInputTypeToOutputType( + " does not match type of output: ", + outputIndex, + "type: ", +- output_value_case); ++ output_value_case, ++ " in ", ++ ctx.getDisplayName(), ++ "."); + } + if (TypeProto::kTensorType == input_value_case) { + auto* dim = output_type->mutable_tensor_type()->mutable_shape()->add_dim(); +@@ -382,7 +400,13 @@ inline void appendSingleDimCopiedFromInputTypeToOutputType( + *dim = input_type->sparse_tensor_type().shape().dim(static_cast(fromDimIndex)); + } else { + fail_type_inference( +- "Input ", inputIndex, " and Output ", outputIndex, " expected to have tensor or sparse tensor type"); ++ "Input ", ++ inputIndex, ++ " and Output ", ++ outputIndex, ++ " expected to have tensor or sparse tensor type in ", ++ ctx.getDisplayName(), ++ "."); + } + } + +@@ -440,7 +464,14 @@ updateOutputElemType(InferenceContext& ctx, size_t outputIndex, int32_t elemType + setTensorElementType(elemType, expected_type, 
*output_type); + } else { + // This is not expected to happen +- fail_type_inference("Output ", outputIndex, " expected to have tensor or sparse tensor type: ", expected_type); ++ fail_type_inference( ++ "Output ", ++ outputIndex, ++ " expected to have tensor or sparse tensor type: ", ++ expected_type, ++ " in ", ++ ctx.getDisplayName(), ++ "."); + } + } + +@@ -462,16 +493,17 @@ inline void propagateElemTypeFromAttributeToOutput( + updateOutputElemType(ctx, outputIndex, default_value, expected_type); + return; + } else { +- fail_type_inference("Value of attribute ", attributeName, " not specified"); ++ fail_type_inference("Value of attribute ", attributeName, " not specified in ", ctx.getDisplayName(), "."); + } + } + if (!attr_proto->has_i()) { +- fail_type_inference("Attribute ", attributeName, " should be of integer type and specify a type."); ++ fail_type_inference( ++ "Attribute ", attributeName, " should be of integer type and specify a type in ", ctx.getDisplayName(), "."); + } + auto attr_value = attr_proto->i(); + auto elem_type = static_cast(attr_value); + if (!TensorProto_DataType_IsValid(elem_type)) { +- fail_type_inference("Attribute ", attributeName, " does not specify a valid type."); ++ fail_type_inference("Attribute ", attributeName, " does not specify a valid type in ", ctx.getDisplayName(), "."); + } + updateOutputElemType(ctx, outputIndex, elem_type, expected_type); + } +@@ -497,7 +529,7 @@ inline TensorShapeProto* + getOutputShape(InferenceContext& ctx, size_t n, TypeProto::ValueCase default_type = TypeProto::kTensorType) { + auto output_type = ctx.getOutputType(n); + if (output_type == nullptr) { +- fail_type_inference("Output ", n, " expected to have tensor or sparse type"); ++ fail_type_inference("Output ", n, " expected to have tensor or sparse type in ", ctx.getDisplayName(), "."); + } + const auto output_value_case = output_type->value_case(); + if (output_value_case == TypeProto::kTensorType || output_value_case == TypeProto::kSparseTensorType) { +@@ -505,7 +537,7 @@ getOutputShape(InferenceContext& ctx, size_t n, TypeProto::ValueCase default_typ + } else if (output_value_case == TypeProto::VALUE_NOT_SET) { + return getTensorMutableShape(default_type, *output_type); + } else { +- fail_type_inference("Output ", n, " expected to have tensor type"); ++ fail_type_inference("Output ", n, " expected to have tensor type in ", ctx.getDisplayName(), "."); + } + } + +@@ -562,13 +594,13 @@ inline void propagateShapeFromAttributeToOutput( + auto attr_proto = ctx.getAttribute(attributeName); + if ((nullptr == attr_proto) || (!attr_proto->has_type()) || + (attr_proto->type() != AttributeProto_AttributeType_INTS)) { +- fail_shape_inference("Attribute ", attributeName, " should specify a shape"); ++ fail_shape_inference("Attribute ", attributeName, " should specify a shape in ", ctx.getDisplayName(), "."); + } + auto& int_list = attr_proto->ints(); + TensorShapeProto shape; + for (auto dim_size : int_list) { + if (dim_size < 0) { +- fail_shape_inference("Negative values are not allowed in a shape specification"); ++ fail_shape_inference("Negative values are not allowed in a shape specification in ", ctx.getDisplayName(), "."); + } + shape.add_dim()->set_dim_value(dim_size); + } +@@ -745,7 +777,16 @@ inline void checkInputRank(InferenceContext& ctx, size_t input_index, int expect + if (hasInputShape(ctx, input_index)) { + auto rank = getInputShape(ctx, input_index).dim_size(); + if (rank != expected_rank) { +- fail_shape_inference("Input ", input_index, " expected to have rank 
", expected_rank, " but has rank ", rank); ++ fail_shape_inference( ++ "Input ", ++ input_index, ++ " expected to have rank ", ++ expected_rank, ++ " but has rank ", ++ rank, ++ " in ", ++ ctx.getDisplayName(), ++ "."); + } + } + } +@@ -798,7 +839,15 @@ inline void unifyInputDim(InferenceContext& ctx, size_t input_index, int dim_ind + // This shape is expected to have rank > dim_index: + if (input_shape.dim_size() <= dim_index) { + fail_shape_inference( +- "Input ", input_index, " expected to have rank >", dim_index, " but has rank ", input_shape.dim_size()); ++ "Input ", ++ input_index, ++ " expected to have rank >", ++ dim_index, ++ " but has rank ", ++ input_shape.dim_size(), ++ " in ", ++ ctx.getDisplayName(), ++ "."); + } + const Dim& input_dim = input_shape.dim(dim_index); + // Now, unify dim and input_dim: +diff --git a/onnx/shape_inference/implementation.cc b/onnx/shape_inference/implementation.cc +index 8723dcd4..8249fc59 100644 +--- a/onnx/shape_inference/implementation.cc ++++ b/onnx/shape_inference/implementation.cc +@@ -906,7 +906,7 @@ struct FunctionInferenceContext : public InferenceContext { + const std::vector& input_types, + const std::vector& attributes, + const ShapeInferenceOptions& options) +- : input_types_(input_types), options_(options) { ++ : input_types_(input_types), options_(options), func_proto_(&func_proto) { + for (const auto& attr : attributes) { + attributesByName_[attr.name()] = &attr; + } +@@ -971,11 +971,25 @@ struct FunctionInferenceContext : public InferenceContext { + return std::move(output_types_); + } + ++ std::string getDisplayName() const override { ++ if (func_proto_ == nullptr) ++ return ""; ++ if (func_proto_->domain().empty()) { ++ if (func_proto_->name().empty()) ++ return ""; ++ return MakeString("function ", func_proto_->name()); ++ } ++ if (func_proto_->name().empty()) ++ return MakeString("function [", func_proto_->domain(), "]"); ++ return MakeString("function ", func_proto_->name(), "[", func_proto_->domain(), "]"); ++ } ++ + private: + const std::vector& input_types_; + std::vector output_types_; + std::unordered_map attributesByName_; + ShapeInferenceOptions options_; ++ const FunctionProto* func_proto_; + }; + + std::vector InferFunctionOutputTypes( +diff --git a/onnx/shape_inference/implementation.h b/onnx/shape_inference/implementation.h +index 2c63c910..b0e4c32d 100644 +--- a/onnx/shape_inference/implementation.h ++++ b/onnx/shape_inference/implementation.h +@@ -146,7 +146,7 @@ struct InferenceContextImpl : public InferenceContext { + const ShapeInferenceOptions& options, + DataValueMap* generatedShapeData = nullptr, + GraphInferenceContext* graphInferenceContext = nullptr) +- : graphInferenceContext_{graphInferenceContext}, options_(options) { ++ : graphInferenceContext_{graphInferenceContext}, options_(options), node_(&n) { + for (auto& attr : *n.mutable_attribute()) { + attributesByName_[attr.name()] = &attr; + if (attr.has_g()) { +@@ -277,6 +277,19 @@ struct InferenceContextImpl : public InferenceContext { + return inferencer; + } + ++ std::string getDisplayName() const override { ++ if (node_ == nullptr) ++ return ""; ++ if (node_->domain().empty()) { ++ if (node_->name().empty()) ++ return MakeString("node ", node_->op_type()); ++ return MakeString("node ", node_->op_type(), " (", node_->name(), ")"); ++ } ++ if (node_->name().empty()) ++ return MakeString("node ", node_->op_type(), "[", node_->domain(), "]"); ++ return MakeString("node ", node_->op_type(), "[", node_->domain(), "]", " (", node_->name(), ")"); ++ } ++ + 
std::vector allInputData_; + std::vector allInputSparseData_; + std::vector allShapeInputData_; +@@ -289,6 +302,7 @@ struct InferenceContextImpl : public InferenceContext { + // mutable as internal cache of GraphInferencer instances + mutable std::unordered_map> graphAttributeInferencers_; + ShapeInferenceOptions options_; ++ NodeProto* node_; + }; + + struct DataPropagationContextImpl : public DataPropagationContext { diff --git a/onnxruntime/test/providers/cpu/generator/random_test.cc b/onnxruntime/test/providers/cpu/generator/random_test.cc index ec9b1614488a7..f42f32d63d1fa 100644 --- a/onnxruntime/test/providers/cpu/generator/random_test.cc +++ b/onnxruntime/test/providers/cpu/generator/random_test.cc @@ -178,7 +178,7 @@ TEST(Random, InvalidDType) { test.AddAttribute("shape", dims); test.AddOutput("Y", dims, expected_output); - test.Run(OpTester::ExpectResult::kExpectFailure, "Attribute dtype does not specify a valid type."); + test.Run(OpTester::ExpectResult::kExpectFailure, "Node (node1) Op (RandomNormal) [TypeInferenceError] Attribute dtype does not specify a valid type in ."); } { @@ -194,7 +194,7 @@ TEST(Random, InvalidDType) { test.AddAttribute("shape", dims); test.AddOutput("Y", dims, expected_output); - test.Run(OpTester::ExpectResult::kExpectFailure, "Attribute dtype does not specify a valid type."); + test.Run(OpTester::ExpectResult::kExpectFailure, "Node (node1) Op (RandomUniform) [TypeInferenceError] Attribute dtype does not specify a valid type in ."); } { @@ -210,7 +210,7 @@ TEST(Random, InvalidDType) { test.AddInput("X", dims, input); test.AddOutput("Y", dims, expected_output); - test.Run(OpTester::ExpectResult::kExpectFailure, "Attribute dtype does not specify a valid type."); + test.Run(OpTester::ExpectResult::kExpectFailure, "Node (node1) Op (RandomNormalLike) [TypeInferenceError] Attribute dtype does not specify a valid type in ."); } { @@ -226,7 +226,7 @@ TEST(Random, InvalidDType) { test.AddInput("X", dims, input); test.AddOutput("Y", dims, expected_output); - test.Run(OpTester::ExpectResult::kExpectFailure, "Attribute dtype does not specify a valid type."); + test.Run(OpTester::ExpectResult::kExpectFailure, "Node (node1) Op (RandomUniformLike) [TypeInferenceError] Attribute dtype does not specify a valid type in ."); } } diff --git a/orttraining/orttraining/core/graph/training_op_defs.cc b/orttraining/orttraining/core/graph/training_op_defs.cc index 2a8d2de982e79..92f803030ada4 100644 --- a/orttraining/orttraining/core/graph/training_op_defs.cc +++ b/orttraining/orttraining/core/graph/training_op_defs.cc @@ -181,6 +181,64 @@ static void propagateRecvOutputTensorElemTypes( } } +void SendShapeInfer(ONNX_NAMESPACE::InferenceContext& ctx) { + if (ctx.getNumInputs() < 3) { + fail_shape_inference("Send must have at least three inputs."); + } else { + if (hasInputShape(ctx, 0)) { + auto& signal_input_shape = getInputShape(ctx, 0); + if (static_cast(signal_input_shape.dim_size()) != 0) { + fail_shape_inference("InputSignal of Send must be a scalar."); + } + } + if (hasInputShape(ctx, 1)) { + auto& remote_input_shape = getInputShape(ctx, 1); + if (static_cast(remote_input_shape.dim_size()) != 0) { + fail_shape_inference("Remote of Send must be a scalar."); + } + } + + checkSendInputTensorElemTypes(ctx, "element_types", ctx.getNumInputs() - 2); + } + + if (ctx.getNumOutputs() != 1) { + fail_shape_inference("Send must have one output."); + } + + auto output_element_type = ctx.getOutputType(0)->mutable_tensor_type(); + output_element_type->set_elem_type(TensorProto::BOOL); + 
ONNX_NAMESPACE::TensorShapeProto output_shape; + updateOutputShape(ctx, 0, {}); + updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL); +} + +void RecvShapeInfer(ONNX_NAMESPACE::InferenceContext& ctx) { + if (ctx.getNumInputs() != 2) { + fail_shape_inference("Recv must have two inputs."); + } else { + if (hasInputShape(ctx, 0)) { + auto& signal_input_shape = getInputShape(ctx, 0); + if (static_cast(signal_input_shape.dim_size()) != 0) { + fail_shape_inference("InputSignal of Recv must be a scalar."); + } + } + if (hasInputShape(ctx, 1)) { + auto& remote_input_shape = getInputShape(ctx, 1); + if (static_cast(remote_input_shape.dim_size()) != 0) { + fail_shape_inference("Remote of Recv must be a scalar."); + } + } + } + + if (ctx.getNumOutputs() < 2) { + fail_shape_inference("Recv must have at least two outputs."); + } + + updateOutputShape(ctx, 0, {}); + updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL); + propagateRecvOutputTensorElemTypes(ctx, "element_types", ctx.getNumOutputs() - 1); +} + TensorProto ToDimensionOneFloatTensor(float value) { auto t = ToTensor(std::vector({value})); t.add_dims(1); @@ -3388,30 +3446,7 @@ Return true if all elements are true and false otherwise. "Constrain types to boolean tensors.") .TypeConstraint("V", OpSchema::all_tensor_types(), "All Tensor types") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - if (ctx.getNumInputs() < 3) { - fail_shape_inference("Send must have at least three inputs."); - } else { - auto& signal_input_shape = getInputShape(ctx, 0); - if (static_cast(signal_input_shape.dim_size()) != 0) { - fail_shape_inference("InputSignal of Send must be a scalar."); - } - auto& remote_input_shape = getInputShape(ctx, 1); - if (static_cast(remote_input_shape.dim_size()) != 0) { - fail_shape_inference("Remote of Send must be a scalar."); - } - - checkSendInputTensorElemTypes(ctx, "element_types", ctx.getNumInputs() - 2); - } - - if (ctx.getNumOutputs() != 1) { - fail_shape_inference("Send must have one output."); - } - - auto output_element_type = ctx.getOutputType(0)->mutable_tensor_type(); - output_element_type->set_elem_type(TensorProto::BOOL); - ONNX_NAMESPACE::TensorShapeProto output_shape; - updateOutputShape(ctx, 0, {}); - updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL); + SendShapeInfer(ctx); }); ONNX_CONTRIB_OPERATOR_SCHEMA(Recv) @@ -3437,26 +3472,7 @@ Return true if all elements are true and false otherwise. 
"Constrain types to boolean tensors.") .TypeConstraint("V", OpSchema::all_tensor_types(), "All Tensor types") .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - if (ctx.getNumInputs() != 2) { - fail_shape_inference("Recv must have two inputs."); - } else { - auto& signal_input_shape = getInputShape(ctx, 0); - if (static_cast(signal_input_shape.dim_size()) != 0) { - fail_shape_inference("InputSignal of Recv must be a scalar."); - } - auto& remote_input_shape = getInputShape(ctx, 1); - if (static_cast(remote_input_shape.dim_size()) != 0) { - fail_shape_inference("Remote of Recv must be a scalar."); - } - } - - if (ctx.getNumOutputs() < 2) { - fail_shape_inference("Recv must have at least two outputs."); - } - - updateOutputShape(ctx, 0, {}); - updateOutputElemType(ctx, 0, ONNX_NAMESPACE::TensorProto::BOOL); - propagateRecvOutputTensorElemTypes(ctx, "element_types", ctx.getNumOutputs() - 1); + RecvShapeInfer(ctx); }); ONNX_CONTRIB_OPERATOR_SCHEMA(MegatronF) From dbff0cd09860b60bd0a251c1dbe76785b0b2818c Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Sun, 28 Jul 2024 13:03:17 -0700 Subject: [PATCH 41/57] [js/node] enable float16 support for Node.js binding (#20581) ### Description enable float16 support for Node.js binding. data of float16 tensor uses `Uint16Array`. --- js/node/src/tensor_helper.cc | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/js/node/src/tensor_helper.cc b/js/node/src/tensor_helper.cc index 1c0b141e6a44f..1062d89f76c5f 100644 --- a/js/node/src/tensor_helper.cc +++ b/js/node/src/tensor_helper.cc @@ -38,13 +38,13 @@ constexpr size_t DATA_TYPE_ELEMENT_SIZE_MAP[] = { 2, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16 2, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16 4, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 - 8, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 INT64 not working in Javascript + 8, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 0, // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING N/A 1, // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL - 0, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 not working in Javascript + 2, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 8, // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE 4, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32 - 8, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 UINT64 not working in Javascript + 8, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 0, // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported 0, // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported 0 // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported @@ -60,13 +60,13 @@ constexpr napi_typedarray_type DATA_TYPE_TYPEDARRAY_MAP[] = { napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16 napi_int16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16 napi_int32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32 - napi_bigint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 INT64 not working i + napi_bigint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64 (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported napi_uint8_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL - (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 not working + napi_uint16_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array napi_float64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE napi_uint32_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32 - napi_biguint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 UINT64 not working + napi_biguint64_array, // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64 
(napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported (napi_typedarray_type)(-1), // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported (napi_typedarray_type)(-1) // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported @@ -182,9 +182,7 @@ Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo * char *buffer = reinterpret_cast(tensorDataTypedArray.ArrayBuffer().Data()); size_t bufferByteOffset = tensorDataTypedArray.ByteOffset(); - // there is a bug in TypedArray::ElementSize(): https://github.com/nodejs/node-addon-api/pull/705 - // TODO: change to TypedArray::ByteLength() in next node-addon-api release. - size_t bufferByteLength = tensorDataTypedArray.ElementLength() * DATA_TYPE_ELEMENT_SIZE_MAP[elemType]; + size_t bufferByteLength = tensorDataTypedArray.ByteLength(); return Ort::Value::CreateTensor(memory_info, buffer + bufferByteOffset, bufferByteLength, dims.empty() ? nullptr : &dims[0], dims.size(), elemType); } From 5bc12bf209304e7f5800845bd612bb3e7b7ab918 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Mon, 29 Jul 2024 23:47:41 +0800 Subject: [PATCH 42/57] [js/webgpu] Add activation for conv3d naive (#21466) ### Description ### Motivation and Context --- .../ops/3rd-party/conv3d_naive_webgpu.ts | 64 +++++----- js/web/test/data/ops/fused-conv3dncdhw.jsonc | 112 ++++++++++++++++++ js/web/test/suite-test-list.jsonc | 1 + 3 files changed, 149 insertions(+), 28 deletions(-) create mode 100644 js/web/test/data/ops/fused-conv3dncdhw.jsonc diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv3d_naive_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv3d_naive_webgpu.ts index f428293add599..a2e5428385101 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv3d_naive_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv3d_naive_webgpu.ts @@ -26,6 +26,9 @@ import {ShapeUtil} from '../../../util'; import {ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../../types'; import {createTensorShapeVariables, getElementAt, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType, UniformsArrayType} from '../common'; import {ConvAttributes} from '../conv'; +import {appendActivationUniforms, appendActivationUniformsData, getActivationSnippet} from '../fuse-utils'; + +import {typeSnippet} from './activation_util'; const arrayProduct = (arr: number[]) => { let product = 1; @@ -218,8 +221,8 @@ export const computeConv3DInfo = export const createConv3DNaiveProgramInfo = (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[], filterDims: readonly number[], pads: readonly number[], dataFormat: string): ProgramInfo => { - const isChannelsLast = dataFormat === 'channelsLast'; - const inChannels = isChannelsLast ? inputs[0].dims[3] : inputs[0].dims[1]; + const isChannelLast = dataFormat === 'channelsLast'; + const inChannels = isChannelLast ? inputs[0].dims[3] : inputs[0].dims[1]; // TODO: enable vec4. const isVec4 = false; const workGroupSize: [number, number, number] = [64, 1, 1]; @@ -228,13 +231,14 @@ export const createConv3DNaiveProgramInfo = LOG_DEBUG('verbose', () => `[conv3d_naive_webgpu] dispatch = ${dispatch}`); - const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : 1; + const innerElementSize = isVec4 ? (isChannelLast && inChannels % 4 !== 0 ? 
3 : 4) : 1; const outputSize = ShapeUtil.size(outputShape); const programUniforms: ProgramUniform[] = [ {type: DataType.uint32, data: outputSize}, {type: DataType.uint32, data: filterDims}, {type: DataType.uint32, data: pads}, {type: DataType.uint32, data: attributes.strides}, {type: DataType.uint32, data: attributes.dilations} ]; + appendActivationUniformsData(attributes, programUniforms); programUniforms.push(...createTensorShapeVariables(inputs[0].dims, inputs[1].dims)); const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'rank']; const hasBias = inputs.length === 3; @@ -251,6 +255,7 @@ export const createConv3DNaiveProgramInfo = {name: 'strides', type: 'u32', length: attributes.strides.length}, {name: 'dilations', type: 'u32', length: attributes.dilations.length} ]; + appendActivationUniforms(attributes, uniforms); // TODO: support component 2, 3. const components = isVec4 ? 4 : 1; const t = tensorTypeToWsglStorageType(inputs[0].dataType); @@ -266,10 +271,12 @@ export const createConv3DNaiveProgramInfo = inputVariables.push(bias); declareFunctions += ` fn getBiasByOutputCoords(coords : array) -> ${isVec4 ? `vec4<${t}>` : t} { - return bias[${isChannelsLast ? getElementAt('coords', 4, 5) : getElementAt('coords', 1, 5)}${ + return bias[${isChannelLast ? getElementAt('coords', 4, 5) : getElementAt('coords', 1, 5)}${ isVec4 ? '/ 4' : ''}]; }`; } + const resType = typeSnippet(innerElementSize, t); + const applyActivation = getActivationSnippet(attributes, resType, t); return ` ${declareFunctions} @@ -287,28 +294,28 @@ export const createConv3DNaiveProgramInfo = let coords = ${output.offsetToIndices('global_idx')}; let batch = ${getElementAt('coords', 0, x.rank)}; let d2 = ${ - isChannelsLast ? getElementAt('coords', x.rank - 1, x.rank) : getElementAt('coords', 1, x.rank)}; + isChannelLast ? getElementAt('coords', x.rank - 1, x.rank) : getElementAt('coords', 1, x.rank)}; let xFRCCorner = vec3(${ - isChannelsLast ? getElementAt('coords', 1, x.rank) : getElementAt('coords', 2, x.rank)}, - ${isChannelsLast ? getElementAt('coords', 2, x.rank) : getElementAt('coords', 3, x.rank)}, + isChannelLast ? getElementAt('coords', 1, x.rank) : getElementAt('coords', 2, x.rank)}, + ${isChannelLast ? getElementAt('coords', 2, x.rank) : getElementAt('coords', 3, x.rank)}, ${ - isChannelsLast ? getElementAt('coords', 3, x.rank) : - getElementAt('coords', 4, x.rank)}) * uniforms.strides - uniforms.pads; + isChannelLast ? getElementAt('coords', 3, x.rank) : + getElementAt('coords', 4, x.rank)}) * uniforms.strides - uniforms.pads; let xFCorner = xFRCCorner.x; let xRCorner = xFRCCorner.y; let xCCorner = xFRCCorner.z; let xShapeY = ${ - isChannelsLast ? getElementAt('uniforms.x_shape', 1, x.rank) : getElementAt('uniforms.x_shape', 2, x.rank)}; + isChannelLast ? getElementAt('uniforms.x_shape', 1, x.rank) : getElementAt('uniforms.x_shape', 2, x.rank)}; let xShapeZ = ${ - isChannelsLast ? getElementAt('uniforms.x_shape', 2, x.rank) : getElementAt('uniforms.x_shape', 3, x.rank)}; + isChannelLast ? getElementAt('uniforms.x_shape', 2, x.rank) : getElementAt('uniforms.x_shape', 3, x.rank)}; let xShapeW = ${ - isChannelsLast ? getElementAt('uniforms.x_shape', 3, x.rank) : getElementAt('uniforms.x_shape', 4, x.rank)}; + isChannelLast ? getElementAt('uniforms.x_shape', 3, x.rank) : getElementAt('uniforms.x_shape', 4, x.rank)}; let xShapeU = ${ - isChannelsLast ? getElementAt('uniforms.x_shape', 4, x.rank) : getElementAt('uniforms.x_shape', 1, x.rank)}; + isChannelLast ? 
getElementAt('uniforms.x_shape', 4, x.rank) : getElementAt('uniforms.x_shape', 1, x.rank)}; let inputDepthNearestVec4 = (xShapeU / 4) * 4; let inputDepthVec4Remainder = xShapeU % 4; - var dotProd = 0.0; + var value = 0.0; for (var wF = 0u; wF < uniforms.filter_dims[0]; wF++) { let xF = xFCorner + wF * uniforms.dilations[0]; if (xF < 0 || xF >= xShapeY) { @@ -329,13 +336,13 @@ export const createConv3DNaiveProgramInfo = for (var d1 = 0u; d1 < inputDepthNearestVec4; d1 += 4) { ${ - isChannelsLast ? `let xValues = vec4( + isChannelLast ? `let xValues = vec4( getX(batch, xF, xR, xC, d1), getX(batch, xF, xR, xC, d1 + 1), getX(batch, xF, xR, xC, d1 + 2), getX(batch, xF, xR, xC, d1 + 3)); ` : - `let xValues = vec4( + `let xValues = vec4( getX(batch, d1, xF, xR, xC), getX(batch, d1 + 1, xF, xR, xC), getX(batch, d1 + 2, xF, xR, xC), @@ -346,36 +353,36 @@ export const createConv3DNaiveProgramInfo = getW(d2, d1 + 1, wF, wR, wC), getW(d2, d1 + 2, wF, wR, wC), getW(d2, d1 + 3, wF, wR, wC)); - dotProd += dot(xValues, wValues); + value += dot(xValues, wValues); } if (inputDepthVec4Remainder == 1) { ${ - isChannelsLast ? `dotProd += getX(batch, xF, xR, xC, inputDepthNearestVec4) + isChannelLast ? `value += getX(batch, xF, xR, xC, inputDepthNearestVec4) * getW(d2, inputDepthNearestVec4, wF, wR, wC);` : - `dotProd += getX(batch, inputDepthNearestVec4, xF, xR, xC) + `value += getX(batch, inputDepthNearestVec4, xF, xR, xC) * getW(d2, inputDepthNearestVec4, wF, wR, wC);`} } else if (inputDepthVec4Remainder == 2) { ${ - isChannelsLast ? `let xValues = vec2( + isChannelLast ? `let xValues = vec2( getX(batch, xF, xR, xC, inputDepthNearestVec4), getX(batch, xF, xR, xC, inputDepthNearestVec4 + 1)); ` : - `let xValues = vec2( + `let xValues = vec2( getX(batch, inputDepthNearestVec4, xF, xR, xC), getX(batch, inputDepthNearestVec4 + 1, xF, xR, xC)); `} let wValues = vec2( getW(d2, inputDepthNearestVec4, wF, wR, wC), getW(d2, inputDepthNearestVec4 + 1, wF, wR, wC)); - dotProd += dot(xValues, wValues); + value += dot(xValues, wValues); } else if (inputDepthVec4Remainder == 3) { ${ - isChannelsLast ? `let xValues = vec3( + isChannelLast ? `let xValues = vec3( getX(batch, xF, xR, xC, inputDepthNearestVec4), getX(batch, xF, xR, xC, inputDepthNearestVec4 + 1), getX(batch, xF, xR, xC, inputDepthNearestVec4 + 2)); ` : - `let xValues = vec3( + `let xValues = vec3( getX(batch, inputDepthNearestVec4, xF, xR, xC), getX(batch, inputDepthNearestVec4 + 1, xF, xR, xC), getX(batch, inputDepthNearestVec4 + 2, xF, xR, xC)); @@ -384,19 +391,20 @@ export const createConv3DNaiveProgramInfo = getW(d2, inputDepthNearestVec4, wF, wR, wC), getW(d2, inputDepthNearestVec4 + 1, wF, wR, wC), getW(d2, inputDepthNearestVec4 + 2, wF, wR, wC)); - dotProd += dot(xValues, wValues); + value += dot(xValues, wValues); } } } } - ${hasBias ? 'dotProd = dotProd + getBiasByOutputCoords(coords)' : ''}; - result[global_idx] = f32(dotProd); + ${hasBias ? 
'value = value + getBiasByOutputCoords(coords)' : ''}; + ${applyActivation} + result[global_idx] = f32(value); }`; }; return { name: 'Conv3DNaive', shaderCache: - {hint: `${attributes.cacheKey};${isChannelsLast};${innerElementSize};${hasBias}`, inputDependencies}, + {hint: `${attributes.cacheKey};${isChannelLast};${innerElementSize};${hasBias}`, inputDependencies}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], dispatchGroup: {x: dispatch[0], y: dispatch[1], z: dispatch[2]}, diff --git a/js/web/test/data/ops/fused-conv3dncdhw.jsonc b/js/web/test/data/ops/fused-conv3dncdhw.jsonc new file mode 100644 index 0000000000000..1801ca380aa09 --- /dev/null +++ b/js/web/test/data/ops/fused-conv3dncdhw.jsonc @@ -0,0 +1,112 @@ +[ + { + "name": "fused conv3d with relu, x=[1, 1, 2, 1, 2], f=[2, 1, 2, 1, 2], s=1, d=1, p=valid, relu", + "operator": "FusedConv", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "activation", "data": "Relu", "type": "string" }, + { "name": "kernel_shape", "data": [2, 1, 2], "type": "ints" }, + { "name": "auto_pad", "data": "VALID", "type": "string" }, + { "name": "strides", "data": [1, 1, 1], "type": "ints" }, + { "name": "dilations", "data": [1, 1, 1], "type": "ints" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.25, 0.5, 0.75, 1], + "dims": [1, 1, 2, 1, 2], + "type": "float32" + }, + { + "data": [-0.125, -0.25, -0.375, 0.5, 0.625, -0.75, -0.875, -1], + "dims": [2, 1, 2, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.0625, 0], + "dims": [1, 2, 1, 1, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "fused conv3d with clip", + "operator": "FusedConv", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "activation", "data": "Clip", "type": "string" }, + { "name": "activation_params", "data": [1.0, 3.0], "type": "floats" }, + { "name": "kernel_shape", "data": [2, 1, 2], "type": "ints" }, + { "name": "auto_pad", "data": "VALID", "type": "string" }, + { "name": "strides", "data": [1, 1, 1], "type": "ints" }, + { "name": "dilations", "data": [1, 1, 1], "type": "ints" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.25, 0.5, 0.75, 1], + "dims": [1, 1, 2, 1, 2], + "type": "float32" + }, + { + "data": [0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1], + "dims": [2, 1, 2, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 2.1875], + "dims": [1, 2, 1, 1, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "fused conv3d with HardSigmoid, x=[1, 1, 2, 1, 2], f=[2, 1, 2, 1, 2], s=1, d=1, p=valid, relu", + "operator": "FusedConv", + "opset": { "domain": "com.microsoft", "version": 1 }, + "attributes": [ + { "name": "activation", "data": "HardSigmoid", "type": "string" }, + { "name": "activation_params", "data": [0.1, 0.3], "type": "floats" }, + { "name": "kernel_shape", "data": [2, 1, 2], "type": "ints" }, + { "name": "auto_pad", "data": "VALID", "type": "string" }, + { "name": "strides", "data": [1, 1, 1], "type": "ints" }, + { "name": "dilations", "data": [1, 1, 1], "type": "ints" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.25, 0.5, 0.75, 1], + "dims": [1, 1, 2, 1, 2], + "type": "float32" + }, + { + "data": [0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1], + "dims": [2, 1, 2, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.39375001192092896, 0.518750011920929], + "dims": [1, 2, 1, 1, 1], + "type": "float32" + } + ] + } + ] + } +] 
diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 4a3a23bfe91b4..4aaf9d16b2b0e 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1358,6 +1358,7 @@ "fast-gelu.jsonc", "floor.jsonc", "fused-conv.jsonc", + "fused-conv3dncdhw.jsonc", "gather-elements.jsonc", "gemm.jsonc", "global-average-pool.jsonc", From 94eb70d98348d83343207e113f9abaa0e7c6ea37 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Mon, 29 Jul 2024 23:50:14 +0800 Subject: [PATCH 43/57] [WebNN EP] Add labels for all WebNN operators (#21516) In order to provide more diagnosable error messages for developers. Spec change: https://github.com/webmachinelearning/webnn/pull/742 --- .../builders/impl/activation_op_builder.cc | 13 ++++--- .../builders/impl/argmax_min_op_builder.cc | 1 + .../webnn/builders/impl/binary_op_builder.cc | 15 ++++--- .../webnn/builders/impl/cast_op_builder.cc | 5 ++- .../webnn/builders/impl/clip_op_builder.cc | 1 + .../webnn/builders/impl/concat_op_builder.cc | 5 ++- .../webnn/builders/impl/conv_op_builder.cc | 16 +++++++- .../impl/dequantizeLinear_op_builder.cc | 17 ++++++-- .../impl/dynamicQuantizeLinear_op_builder.cc | 3 +- .../webnn/builders/impl/expand_op_builder.cc | 6 ++- .../webnn/builders/impl/flatten_op_builder.cc | 4 +- .../webnn/builders/impl/gather_op_builder.cc | 1 + .../webnn/builders/impl/gemm_op_builder.cc | 39 +++++++++++++++---- .../webnn/builders/impl/logical_op_builder.cc | 12 +++--- .../webnn/builders/impl/max_min_op_builder.cc | 10 +++-- .../builders/impl/normalization_op_builder.cc | 16 ++++++-- .../webnn/builders/impl/pad_op_builder.cc | 6 ++- .../webnn/builders/impl/pool_op_builder.cc | 1 + .../builders/impl/reduction_op_builder.cc | 1 + .../webnn/builders/impl/reshape_op_builder.cc | 7 +++- .../webnn/builders/impl/resize_op_builder.cc | 1 + .../webnn/builders/impl/shape_op_builder.cc | 9 ++++- .../webnn/builders/impl/slice_op_builder.cc | 5 ++- .../webnn/builders/impl/softmax_op_builder.cc | 4 +- .../webnn/builders/impl/split_op_builder.cc | 1 + .../impl/squeeze_unsqueeze_op_builder.cc | 8 +++- .../webnn/builders/impl/ternary_op_builder.cc | 4 +- .../builders/impl/transpose_op_builder.cc | 1 + .../builders/impl/triangular_op_builder.cc | 1 + .../webnn/builders/impl/unary_op_builder.cc | 30 +++++++------- 30 files changed, 180 insertions(+), 63 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc index af0f0133b497a..626aaf5c71b74 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc @@ -36,6 +36,7 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); if (op_type == "Elu") { options.set("alpha", helper.Get("alpha", 1.0f)); output = model_builder.GetBuilder().call("elu", input, options); @@ -46,20 +47,20 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, options.set("beta", helper.Get("beta", 0.5f)); output = model_builder.GetBuilder().call("hardSigmoid", input, options); } else if (op_type == "HardSwish") { - output = model_builder.GetBuilder().call("hardSwish", input); + output = model_builder.GetBuilder().call("hardSwish", input, options); } else if (op_type == "LeakyRelu") { options.set("alpha", 
helper.Get("alpha", 0.0f)); output = model_builder.GetBuilder().call("leakyRelu", input, options); } else if (op_type == "Relu") { - output = model_builder.GetBuilder().call("relu", input); + output = model_builder.GetBuilder().call("relu", input, options); } else if (op_type == "Sigmoid") { - output = model_builder.GetBuilder().call("sigmoid", input); + output = model_builder.GetBuilder().call("sigmoid", input, options); } else if (op_type == "Softplus") { - output = model_builder.GetBuilder().call("softplus", input); + output = model_builder.GetBuilder().call("softplus", input, options); } else if (op_type == "Softsign") { - output = model_builder.GetBuilder().call("softsign", input); + output = model_builder.GetBuilder().call("softsign", input, options); } else if (op_type == "Tanh") { - output = model_builder.GetBuilder().call("tanh", input); + output = model_builder.GetBuilder().call("tanh", input, options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "ActivationOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc index 1ae63a644a287..05f3a742a3775 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc @@ -47,6 +47,7 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, options.set("keepDimensions", keep_dims == 1); // TODO(Honry): check whether int64 output data type is supported by WebNN opSupportLimits() API. options.set("outputDataType", "int64"); + options.set("label", node.Name()); emscripten::val output = emscripten::val::object(); const auto& op_type = node.OpType(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc index 23e19d5943144..555de68cd60fe 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/binary_op_builder.cc @@ -35,18 +35,21 @@ Status BinaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const emscripten::val input0 = model_builder.GetOperand(node.InputDefs()[0]->Name()); emscripten::val input1 = model_builder.GetOperand(node.InputDefs()[1]->Name()); emscripten::val output = emscripten::val::object(); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + if (op_type == "Add") { - output = model_builder.GetBuilder().call("add", input0, input1); + output = model_builder.GetBuilder().call("add", input0, input1, options); } else if (op_type == "Sub") { - output = model_builder.GetBuilder().call("sub", input0, input1); + output = model_builder.GetBuilder().call("sub", input0, input1, options); } else if (op_type == "Mul") { - output = model_builder.GetBuilder().call("mul", input0, input1); + output = model_builder.GetBuilder().call("mul", input0, input1, options); } else if (op_type == "Div") { - output = model_builder.GetBuilder().call("div", input0, input1); + output = model_builder.GetBuilder().call("div", input0, input1, options); } else if (op_type == "Pow") { - output = model_builder.GetBuilder().call("pow", input0, input1); + output = model_builder.GetBuilder().call("pow", input0, input1, options); } else if (op_type == "PRelu") { - output = model_builder.GetBuilder().call("prelu", input0, input1); + output = 
model_builder.GetBuilder().call("prelu", input0, input1, options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "BinaryOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); diff --git a/onnxruntime/core/providers/webnn/builders/impl/cast_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/cast_op_builder.cc index a97d71b90de55..a08e1681a8464 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/cast_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/cast_op_builder.cc @@ -69,8 +69,11 @@ Status CastOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, node.Name(), " type: ", to_type); } + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + emscripten::val output = - model_builder.GetBuilder().call("cast", input, emscripten::val(operand_type)); + model_builder.GetBuilder().call("cast", input, emscripten::val(operand_type), options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc index e6403a4cd12dc..b5c3206072d50 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc @@ -53,6 +53,7 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, "GetClipMinMax failed"); options.set("minValue", minValue); options.set("maxValue", maxValue); + options.set("label", node.Name()); emscripten::val input = model_builder.GetOperand(input_name); emscripten::val output = model_builder.GetBuilder().call("clamp", input, options); diff --git a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc index e4f98b09e03c5..dedc76b80e978 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc @@ -42,8 +42,11 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, inputs.push_back(model_builder.GetOperand(input->Name())); } + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + emscripten::val output = - model_builder.GetBuilder().call("concat", emscripten::val::array(inputs), axis); + model_builder.GetBuilder().call("concat", emscripten::val::array(inputs), axis, options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc index 320aaa03930fd..4f3f7459a7b5b 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/conv_op_builder.cc @@ -242,6 +242,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); ORT_RETURN_IF_ERROR(SetConvBaseOptions( model_builder, node, options, input_shape, weight_shape, strides, dilations, pads, is_nhwc, is_conv1d, logger)); bool depthwise = false; @@ -276,7 +277,12 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N if (!is_nhwc || !is_constant_weight) { // The weight_shape has been appended 1's, reshape weight operand. 
std::vector new_shape = GetVecUint32FromVecInt64(weight_shape); - filter = model_builder.GetBuilder().call("reshape", filter, emscripten::val::array(new_shape)); + emscripten::val reshape_options = emscripten::val::object(); + reshape_options.set("label", node.Name() + "_reshape_filter"); + filter = model_builder.GetBuilder().call("reshape", + filter, + emscripten::val::array(new_shape), + reshape_options); } } @@ -293,6 +299,7 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N perm = {0, 2, 3, 1}; // L_0231 } transpose_options.set("permutation", emscripten::val::array(perm)); + transpose_options.set("label", node.Name() + "_transpose_filter"); filter = model_builder.GetBuilder().call("transpose", filter, transpose_options); } @@ -323,7 +330,12 @@ Status ConvOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N std::vector output_shape; ORT_RETURN_IF_NOT(GetShape(*output_defs[0], output_shape, logger), "Cannot get output shape"); std::vector new_shape = GetVecUint32FromVecInt64(output_shape); - output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(new_shape)); + emscripten::val reshape_options = emscripten::val::object(); + reshape_options.set("label", node.Name() + "_reshape_output"); + output = model_builder.GetBuilder().call("reshape", + output, + emscripten::val::array(new_shape), + reshape_options); } model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); diff --git a/onnxruntime/core/providers/webnn/builders/impl/dequantizeLinear_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/dequantizeLinear_op_builder.cc index 66d502a4e6727..93a12a696cce1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/dequantizeLinear_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/dequantizeLinear_op_builder.cc @@ -50,11 +50,22 @@ Status DequantizeLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_buil std::vector target_shape{static_cast(input_shape[axis])}; target_shape.insert(target_shape.begin(), axis, 1); target_shape.insert(target_shape.end(), input_shape.size() - axis - 1, 1); - scale = model_builder.GetBuilder().call("reshape", scale, emscripten::val::array(target_shape)); + emscripten::val reshape_scale_options = emscripten::val::object(); + reshape_scale_options.set("label", node.Name() + "_reshape_scale"); + scale = model_builder.GetBuilder().call("reshape", + scale, + emscripten::val::array(target_shape), + reshape_scale_options); + emscripten::val reshape_zero_point_options = emscripten::val::object(); + reshape_zero_point_options.set("label", node.Name() + "_reshape_zero_point"); zero_point = model_builder.GetBuilder().call("reshape", - zero_point, emscripten::val::array(target_shape)); + zero_point, + emscripten::val::array(target_shape), + reshape_zero_point_options); } - output = model_builder.GetBuilder().call("dequantizeLinear", input, scale, zero_point); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + output = model_builder.GetBuilder().call("dequantizeLinear", input, scale, zero_point, options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); diff --git a/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc index 3b5f64584b828..55746bb1f61f0 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc +++ 
b/onnxruntime/core/providers/webnn/builders/impl/dynamicQuantizeLinear_op_builder.cc @@ -31,8 +31,9 @@ Status DynamicQuantizaLinearOpBuilder::AddToModelBuilderImpl(ModelBuilder& model std::vector input_shape; ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get shape"); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); - output_array = model_builder.GetBuilder().call("dynamicQuantizeLinear", input); + output_array = model_builder.GetBuilder().call("dynamicQuantizeLinear", input, options); for (size_t i = 0, count = output_array["length"].as(); i < count; i++) { model_builder.AddOperand(node.OutputDefs()[i]->Name(), std::move(output_array[i])); diff --git a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc index 9c75c00fa9273..c8cea833983b1 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/expand_op_builder.cc @@ -53,10 +53,14 @@ Status ExpandOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, std::vector output_shape; ORT_RETURN_IF_NOT(GetBidirectionalBroadcastShape(input_shape, new_shape, output_shape), "Cannot get output shape."); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + emscripten::val output = model_builder.GetBuilder().call("expand", input, - emscripten::val::array(GetVecUint32FromVecInt64(output_shape))); + emscripten::val::array(GetVecUint32FromVecInt64(output_shape)), + options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } diff --git a/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc index 31b1bd92a9503..d0ece026a7048 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/flatten_op_builder.cc @@ -52,8 +52,10 @@ Status FlattenOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, SafeInt(num_post_axis_elements)}; emscripten::val inputs = model_builder.GetOperand(input_defs[0]->Name()); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); emscripten::val output = model_builder.GetBuilder().call( - "reshape", inputs, emscripten::val::array(new_shape)); + "reshape", inputs, emscripten::val::array(new_shape), options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/gather_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gather_op_builder.cc index 014a08616c44f..23233539d34c7 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gather_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gather_op_builder.cc @@ -42,6 +42,7 @@ Status GatherOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, emscripten::val indices = model_builder.GetOperand(input_defs[1]->Name()); emscripten::val options = emscripten::val::object(); options.set("axis", axis); + options.set("label", node.Name()); emscripten::val output = model_builder.GetBuilder().call("gather", input, indices, options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc 
b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 53f885019ab2f..bd452b118fe3e 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -39,6 +39,8 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N emscripten::val a = model_builder.GetOperand(node.InputDefs()[a_idx]->Name()); emscripten::val b = model_builder.GetOperand(node.InputDefs()[b_idx]->Name()); emscripten::val output = emscripten::val::object(); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); if (op_type == "MatMul") { std::vector a_shape; if (!GetShape(*input_defs[a_idx], a_shape, logger)) { @@ -53,23 +55,34 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N if (a_shape.size() == 1) { extended_a_shape = true; a_shape.insert(a_shape.begin(), 1); + emscripten::val reshape_a_options = emscripten::val::object(); + reshape_a_options.set("label", node.Name() + "_reshape_a"); a = model_builder.GetBuilder().call("reshape", a, - emscripten::val::array(GetVecUint32FromVecInt64(a_shape))); + emscripten::val::array(GetVecUint32FromVecInt64(a_shape)), + reshape_a_options); } // If the second argument is 1-D, it is promoted to a matrix by appending a 1 to its dimensions. bool extended_b_shape = false; if (b_shape.size() == 1) { extended_b_shape = true; b_shape.push_back(1); + emscripten::val reshape_b_options = emscripten::val::object(); + reshape_b_options.set("label", node.Name() + "_reshape_b"); b = model_builder.GetBuilder().call("reshape", b, - emscripten::val::array(GetVecUint32FromVecInt64(b_shape))); + emscripten::val::array(GetVecUint32FromVecInt64(b_shape)), + reshape_b_options); } - output = model_builder.GetBuilder().call("matmul", a, b); + output = model_builder.GetBuilder().call("matmul", a, b, options); + emscripten::val reshape_output_options = emscripten::val::object(); + reshape_output_options.set("label", node.Name() + "_reshape_output"); // If the inputs are both 1D, reduce the output to a scalar. if (extended_a_shape && extended_b_shape) { - output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array()); + output = model_builder.GetBuilder().call("reshape", + output, + emscripten::val::array(), + reshape_output_options); } // After matrix multiplication the prepended 1 is removed. else if (extended_a_shape) { @@ -78,7 +91,10 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N new_shape.push_back(narrow(b_shape[i])); } new_shape.push_back(narrow(b_shape.back())); - output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(new_shape)); + output = model_builder.GetBuilder().call("reshape", + output, + emscripten::val::array(new_shape), + reshape_output_options); } // After matrix multiplication the appended 1 is removed. 
else if (extended_b_shape) { @@ -86,7 +102,10 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N for (size_t i = 0; i < a_shape.size() - 1; i++) { new_shape.push_back(narrow(a_shape[i])); } - output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(new_shape)); + output = model_builder.GetBuilder().call("reshape", + output, + emscripten::val::array(new_shape), + reshape_output_options); } } else if (op_type == "MatMulInteger") { emscripten::val a_zero_point = emscripten::val::null(); @@ -101,9 +120,13 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } else { b_zero_point = model_builder.GetZeroConstant("uint8"); } - output = model_builder.GetBuilder().call("matmulInteger", a, a_zero_point, b, b_zero_point); + output = model_builder.GetBuilder().call("matmulInteger", + a, + a_zero_point, + b, + b_zero_point, + options); } else { // Gemm - emscripten::val options = emscripten::val::object(); NodeAttrHelper helper(node); const auto transA = helper.Get("transA", 0); options.set("aTranspose", emscripten::val(transA == 1)); diff --git a/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc index e56e8f6a3eb6d..23f3a938fee5e 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/logical_op_builder.cc @@ -33,16 +33,18 @@ Status LogicalOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons emscripten::val input0 = model_builder.GetOperand(node.InputDefs()[0]->Name()); emscripten::val input1 = model_builder.GetOperand(node.InputDefs()[1]->Name()); emscripten::val output = emscripten::val::object(); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); if (op_type == "Equal") { - output = model_builder.GetBuilder().call("equal", input0, input1); + output = model_builder.GetBuilder().call("equal", input0, input1, options); } else if (op_type == "Greater") { - output = model_builder.GetBuilder().call("greater", input0, input1); + output = model_builder.GetBuilder().call("greater", input0, input1, options); } else if (op_type == "GreaterOrEqual") { - output = model_builder.GetBuilder().call("greaterOrEqual", input0, input1); + output = model_builder.GetBuilder().call("greaterOrEqual", input0, input1, options); } else if (op_type == "Less") { - output = model_builder.GetBuilder().call("lesser", input0, input1); + output = model_builder.GetBuilder().call("lesser", input0, input1, options); } else if (op_type == "LessOrEqual") { - output = model_builder.GetBuilder().call("lesserOrEqual", input0, input1); + output = model_builder.GetBuilder().call("lesserOrEqual", input0, input1, options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "LogicalOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); diff --git a/onnxruntime/core/providers/webnn/builders/impl/max_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/max_min_op_builder.cc index 0168f59273545..1080fd0a3f943 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/max_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/max_min_op_builder.cc @@ -43,22 +43,26 @@ Status MaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, ORT_RETURN_IF_NOT(op_type == "Max" || op_type == "Min", "MaxMinOpBuilder, unknown op: ", op_type); emscripten::val output = emscripten::val::object(); + 
emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); if (input_count == 1) { // For 1 input, just concat the single input as workaround. // TODO: use identity instead once it's available in WebNN. emscripten::val inputs = emscripten::val::array(); inputs.call("push", input0); - output = model_builder.GetBuilder().call("concat", inputs, 0); + output = model_builder.GetBuilder().call("concat", inputs, 0, options); } else { std::string webnn_op_name = op_type == "Max" ? "max" : "min"; emscripten::val input1 = model_builder.GetOperand(input_defs[1]->Name()); - output = model_builder.GetBuilder().call(webnn_op_name.c_str(), input0, input1); + output = model_builder.GetBuilder().call(webnn_op_name.c_str(), input0, input1, options); for (size_t input_index = 2; input_index < input_count; ++input_index) { emscripten::val next_input = model_builder.GetOperand(input_defs[input_index]->Name()); - output = model_builder.GetBuilder().call(webnn_op_name.c_str(), output, next_input); + emscripten::val next_options = emscripten::val::object(); + next_options.set("label", node.Name() + "_" + input_defs[input_index]->Name()); + output = model_builder.GetBuilder().call(webnn_op_name.c_str(), output, next_input, next_options); } } diff --git a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc index a2aa0df5586e3..4d068baf35e72 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc @@ -42,6 +42,7 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder const auto rank = input_shape.size(); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); std::vector scale_shape; ORT_RETURN_IF_NOT(GetShape(*input_defs[1], scale_shape, logger), "Cannot get scale shape"); @@ -116,7 +117,12 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder new_shape.erase(insertion_point, insertion_point + excess_rank); *insertion_point = sum; } - input = model_builder.GetBuilder().call("reshape", input, emscripten::val::array(new_shape)); + emscripten::val reshape_input_options = emscripten::val::object(); + reshape_input_options.set("label", node.Name() + "_reshape_input"); + input = model_builder.GetBuilder().call("reshape", + input, + emscripten::val::array(new_shape), + reshape_input_options); } if (model_builder.GetPreferredLayout() == DataLayout::NHWC) { @@ -126,8 +132,12 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder // Reshape back to the original output shape for 3D input. 
if (input_shape.size() != 4) { std::vector output_shape = GetVecUint32FromVecInt64(input_shape); - output = model_builder.GetBuilder().call( - "reshape", output, emscripten::val::array(output_shape)); + emscripten::val reshape_output_options = emscripten::val::object(); + reshape_output_options.set("label", node.Name() + "reshape_output"); + output = model_builder.GetBuilder().call("reshape", + output, + emscripten::val::array(output_shape), + reshape_output_options); } } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported normalization op: ", op_type); diff --git a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc index bc90821ba4ed8..071155a2fb372 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pad_op_builder.cc @@ -73,6 +73,7 @@ Status PadOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get input shape"); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); NodeAttrHelper helper(node); const auto pad_mode = helper.Get("mode", std::string("constant")); @@ -145,9 +146,12 @@ Status PadOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, starts.push_back(start_padding[i] >= 0 ? SafeInt(0) : SafeInt(-start_padding[i])); sizes.push_back(SafeInt(input_shape[i] + start_padding[i] + end_padding[i])); } + emscripten::val slice_options = emscripten::val::object(); + slice_options.set("label", node.Name() + "_slice_output"); output = model_builder.GetBuilder().call("slice", output, emscripten::val::array(starts), - emscripten::val::array(sizes)); + emscripten::val::array(sizes), + slice_options); } model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc index 8b3eecf35fcc8..0af62dacedbd5 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/pool_op_builder.cc @@ -59,6 +59,7 @@ Status PoolOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, } emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); NodeAttrHelper helper(node); const auto kernel_shape = helper.Get("kernel_shape", std::vector{0, 0}); diff --git a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc index 461050849385a..3e6d4d9820e9a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/reduction_op_builder.cc @@ -57,6 +57,7 @@ Status ReductionOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); const auto keep_dims = helper.Get("keepdims", 1); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); options.set("keepDimensions", keep_dims == 1); std::vector axes_data; diff --git a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc index b5005269b96a7..a7911683f0355 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc +++ 
b/onnxruntime/core/providers/webnn/builders/impl/reshape_op_builder.cc @@ -58,8 +58,13 @@ Status ReshapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, std::transform(target_shape.cbegin(), target_shape.cend(), std::back_inserter(new_shape), [](int64_t dim) -> uint32_t { return SafeInt(dim); }); + + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); emscripten::val output = model_builder.GetBuilder().call("reshape", - input, emscripten::val::array(new_shape)); + input, + emscripten::val::array(new_shape), + options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc index c4ca980fec715..2218c858951d3 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc @@ -106,6 +106,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); NodeAttrHelper helper(node); const auto mode = helper.Get("mode", "nearest"); if (mode == "linear") { diff --git a/onnxruntime/core/providers/webnn/builders/impl/shape_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/shape_op_builder.cc index 1552023d3f876..0eb7dafdffe4d 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/shape_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/shape_op_builder.cc @@ -55,8 +55,15 @@ Status ShapeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, emscripten::val sizes = emscripten::val::array(); sizes.call("push", slice_length); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + // Since WebNN doesn't support Shape op, we use constant + slice ops as workaround. 
- emscripten::val output = model_builder.GetBuilder().call("slice", shape_constant, starts, sizes); + emscripten::val output = model_builder.GetBuilder().call("slice", + shape_constant, + starts, + sizes, + options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc index fb452aec1c929..bef13841c646c 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/slice_op_builder.cc @@ -97,9 +97,12 @@ Status SliceOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, sizes.begin(), [](int64_t i, int64_t j) { return SafeInt(i - j); }); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); emscripten::val output = model_builder.GetBuilder().call("slice", inputs, emscripten::val::array(starts), - emscripten::val::array(sizes)); + emscripten::val::array(sizes), + options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc index 95c1dbd518061..798cfabae65db 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/softmax_op_builder.cc @@ -42,7 +42,9 @@ Status SoftmaxOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, int32_t axis = helper.Get("axis", default_axis); axis = static_cast(HandleNegativeAxis(axis, input_size)); - emscripten::val output = model_builder.GetBuilder().call("softmax", input, axis); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + emscripten::val output = model_builder.GetBuilder().call("softmax", input, axis, options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } diff --git a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc index ea3b8ef384ddc..4c59b694d690a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc @@ -49,6 +49,7 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get shape"); const size_t rank = input_shape.size(); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); NodeAttrHelper helper(node); int32_t axis = helper.Get("axis", 0); diff --git a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc index 8e6feb62fa8c4..5eff96873b8c4 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/squeeze_unsqueeze_op_builder.cc @@ -54,7 +54,6 @@ Status SqueezeUnsqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_buil ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get input shape"); const auto input_rank = input_shape.size(); - emscripten::val options = emscripten::val::object(); std::vector axes_data; auto rank = input_rank; @@ -111,7 +110,12 @@ 
Status SqueezeUnsqueezeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_buil "SqueezeUnsqueezeOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); } - output = model_builder.GetBuilder().call("reshape", input, emscripten::val::array(new_shape)); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); + output = model_builder.GetBuilder().call("reshape", + input, + emscripten::val::array(new_shape), + options); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); } diff --git a/onnxruntime/core/providers/webnn/builders/impl/ternary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/ternary_op_builder.cc index 841e2d18244d5..2ed8330bf25be 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/ternary_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/ternary_op_builder.cc @@ -32,9 +32,11 @@ Status TernaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, cons emscripten::val input0 = model_builder.GetOperand(node.InputDefs()[0]->Name()); emscripten::val input1 = model_builder.GetOperand(node.InputDefs()[1]->Name()); emscripten::val input2 = model_builder.GetOperand(node.InputDefs()[2]->Name()); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); emscripten::val output = emscripten::val::object(); if (op_type == "Where") { - output = model_builder.GetBuilder().call("where", input0, input1, input2); + output = model_builder.GetBuilder().call("where", input0, input1, input2, options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "TernaryOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); diff --git a/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc index 3921b1da188c3..03c88ad9db88a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/transpose_op_builder.cc @@ -42,6 +42,7 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, emscripten::val input = model_builder.GetOperand(input_defs[0]->Name()); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); std::vector permutation = GetVecUint32FromVecInt64(perm); options.set("permutation", emscripten::val::array(permutation)); emscripten::val output = model_builder.GetBuilder().call("transpose", input, options); diff --git a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc index e4b7021d49b30..0c818533918a4 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/triangular_op_builder.cc @@ -46,6 +46,7 @@ Status TriangularOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, emscripten::val output = emscripten::val::object(); NodeAttrHelper helper(node); emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); const bool upper = helper.Get("upper", 1); options.set("upper", upper); diff --git a/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc index e0016de8e69b7..061404c8a9ce0 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc +++ 
b/onnxruntime/core/providers/webnn/builders/impl/unary_op_builder.cc @@ -30,35 +30,37 @@ Status UnaryOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const emscripten::val input = model_builder.GetOperand(node.InputDefs()[0]->Name()); emscripten::val output = emscripten::val::object(); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name()); if (op_type == "Abs") { - output = model_builder.GetBuilder().call("abs", input); + output = model_builder.GetBuilder().call("abs", input, options); } else if (op_type == "Ceil") { - output = model_builder.GetBuilder().call("ceil", input); + output = model_builder.GetBuilder().call("ceil", input, options); } else if (op_type == "Cos") { - output = model_builder.GetBuilder().call("cos", input); + output = model_builder.GetBuilder().call("cos", input, options); } else if (op_type == "Erf") { - output = model_builder.GetBuilder().call("erf", input); + output = model_builder.GetBuilder().call("erf", input, options); } else if (op_type == "Exp") { - output = model_builder.GetBuilder().call("exp", input); + output = model_builder.GetBuilder().call("exp", input, options); } else if (op_type == "Floor") { - output = model_builder.GetBuilder().call("floor", input); + output = model_builder.GetBuilder().call("floor", input, options); } else if (op_type == "Identity") { - output = model_builder.GetBuilder().call("identity", input); + output = model_builder.GetBuilder().call("identity", input, options); } else if (op_type == "Log") { - output = model_builder.GetBuilder().call("log", input); + output = model_builder.GetBuilder().call("log", input, options); } else if (op_type == "Neg") { - output = model_builder.GetBuilder().call("neg", input); + output = model_builder.GetBuilder().call("neg", input, options); } else if (op_type == "Not") { - output = model_builder.GetBuilder().call("logicalNot", input); + output = model_builder.GetBuilder().call("logicalNot", input, options); } else if (op_type == "Reciprocal") { - output = model_builder.GetBuilder().call("reciprocal", input); + output = model_builder.GetBuilder().call("reciprocal", input, options); } else if (op_type == "Sin") { - output = model_builder.GetBuilder().call("sin", input); + output = model_builder.GetBuilder().call("sin", input, options); } else if (op_type == "Sqrt") { - output = model_builder.GetBuilder().call("sqrt", input); + output = model_builder.GetBuilder().call("sqrt", input, options); } else if (op_type == "Tan") { - output = model_builder.GetBuilder().call("tan", input); + output = model_builder.GetBuilder().call("tan", input, options); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "UnaryOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); From d8888136e3cdf29fa63d3b0a08a58683a7c9f0a0 Mon Sep 17 00:00:00 2001 From: mingyueliuh <131847423+mingyueliuh@users.noreply.github.com> Date: Mon, 29 Jul 2024 12:45:52 -0400 Subject: [PATCH 44/57] Add support tensor element type for register custom op shape infer function (#21387) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description Functionality extension for the SetOutputShape method in custom op shape inference. ### Motivation and Context - **SetOutputShape** Interface enhancement Actually, the shape infer function need set the tensor type and shape ,Add a parameter **type** to allow users to specify the tensor type, and set **ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT** as default value to ensure compatibility. 
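For illustration, a minimal sketch of how the extended signature can be used from a custom op's shape-inference callback. The op and callback wiring here are hypothetical; only SetOutputShape and its new `type` parameter come from this change.

```cpp
// Hypothetical shape-inference callback for a custom op whose output keeps the
// input shape but is int64 typed. Without the third argument the output element
// type would default to ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT, as before.
Ort::Status InferMyCustomOpShape(Ort::ShapeInferContext& ctx) {
  const auto& input_shape = ctx.GetInputShape(0);
  return ctx.SetOutputShape(0, input_shape, ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64);
}
```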
Co-authored-by: mingyue --- include/onnxruntime/core/session/onnxruntime_cxx_api.h | 2 +- include/onnxruntime/core/session/onnxruntime_cxx_inline.h | 3 ++- onnxruntime/core/session/custom_ops.cc | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 5d974e1ff5185..29a229f427163 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -2216,7 +2216,7 @@ struct ShapeInferContext { size_t GetInputCount() const { return input_shapes_.size(); } - Status SetOutputShape(size_t indice, const Shape& shape); + Status SetOutputShape(size_t indice, const Shape& shape, ONNXTensorElementDataType type = ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT); int64_t GetAttrInt(const char* attr_name); diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index aaef111b9f15b..9b9dd81a749c0 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -1998,9 +1998,10 @@ inline ShapeInferContext::ShapeInferContext(const OrtApi* ort_api, } } -inline Status ShapeInferContext::SetOutputShape(size_t indice, const Shape& shape) { +inline Status ShapeInferContext::SetOutputShape(size_t indice, const Shape& shape, ONNXTensorElementDataType type) { OrtTensorTypeAndShapeInfo* info = {}; ORT_CXX_RETURN_ON_API_FAIL(ort_api_->CreateTensorTypeAndShapeInfo(&info)); + ORT_CXX_RETURN_ON_API_FAIL(ort_api_->SetTensorElementType(info, type)); using InfoPtr = std::unique_ptr>; diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index 4c782f647371e..33d2a0244b453 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -105,6 +105,7 @@ struct OrtShapeInferContext { } } ONNX_NAMESPACE::updateOutputShape(ctx_, index, shape_proto); + ONNX_NAMESPACE::updateOutputElemType(ctx_, index, info->type); return onnxruntime::Status::OK(); } From 05cef469e81e3695667f122beecf97600094d09b Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 30 Jul 2024 00:59:46 +0800 Subject: [PATCH 45/57] Move on-device training packages publish step (#21539) ### Description Since the onedevice training cpu packaging has been a separated pipeline, it's nuget package publishing step must be moved as well. ### Motivation and Context Fixes the exception in Nuget Publishing Packaging Pipeline caused by #21485 --- .../c-api-training-packaging-pipelines.yml | 27 +++++++++++++++++-- .../github/azure-pipelines/publish-nuget.yml | 7 +---- 2 files changed, 26 insertions(+), 8 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml index aecece05a0e58..22ee7de8a5de0 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-training-packaging-pipelines.yml @@ -32,13 +32,25 @@ parameters: displayName: Number added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the sequence of a pre-release package. type: number default: 0 - + +# these 2 parameters are used for debugging. 
+- name: SpecificArtifact + displayName: Use Specific Artifact (Debugging only) + type: boolean + default: false + +- name: BuildId + displayName: Pipeline BuildId, you could find it in the URL + type: string + default: '0' + stages: - template: stages/set_packaging_variables_stage.yml parameters: IsReleaseBuild: ${{ parameters.IsReleaseBuild }} PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }} PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }} + - template: templates/ondevice-training-cpu-packaging-pipeline.yml parameters: RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} @@ -48,4 +60,15 @@ stages: OrtNugetPackageId: 'Microsoft.ML.OnnxRuntime.Training' AdditionalBuildFlags: '--enable_training_apis' AdditionalWinBuildFlags: '--enable_onnx_tests --enable_wcos' - BuildVariant: 'default' \ No newline at end of file + BuildVariant: 'default' + +- template: templates/publish-nuget-steps.yml + parameters: + download_artifacts_steps: + - template: templates/flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Pipeline Artifact - Signed NuGet Training Package' + ArtifactName: 'drop-signed-nuget-Training-CPU' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact/final-package' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} diff --git a/tools/ci_build/github/azure-pipelines/publish-nuget.yml b/tools/ci_build/github/azure-pipelines/publish-nuget.yml index 206a9464de6ef..b78d586288ba3 100644 --- a/tools/ci_build/github/azure-pipelines/publish-nuget.yml +++ b/tools/ci_build/github/azure-pipelines/publish-nuget.yml @@ -32,11 +32,6 @@ stages: artifact: 'drop-signed-nuget-dml' - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-dml\*" $(Build.BinariesDirectory)\nuget-artifact\final-package - - download: build - displayName: 'Download Pipeline Artifact - Signed NuGet Package' - artifact: 'drop-signed-nuget-Training-CPU' - - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-Training-CPU\*" $(Build.BinariesDirectory)\nuget-artifact\final-package - # Publish CUDA 11 Nuget/Java pkgs to ADO feed - template: stages/nuget-cuda-publishing-stage.yml parameters: @@ -44,4 +39,4 @@ stages: - template: stages/java-cuda-publishing-stage.yml parameters: - artifact_feed: $(ArtifactFeed) \ No newline at end of file + artifact_feed: $(ArtifactFeed) From bc3713206dc1d6c7e5062389ef7db42ac2051a30 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 29 Jul 2024 10:00:21 -0700 Subject: [PATCH 46/57] Update QNN pipeline pool (#21482) ### Description Update QNN pipeline pool ### Motivation and Context Let all our pipelines are using the latest NDK version --- ...droid-arm64-v8a-QNN-crosscompile-ci-pipeline.yml | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index 6649206c0d79c..c80092fc82ed5 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -35,7 +35,7 @@ parameters: jobs: - job: Build_QNN_EP - pool: onnxruntime-qnn-ubuntu-2004-cpu + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 30 workspace: clean: all @@ -46,6 +46,10 @@ jobs: inputs: versionSpec: $(pythonVersion) + - script: | + env | grep ANDROID + displayName: View 
Android ENVs + - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build displayName: Install coreutils and ninja @@ -56,13 +60,6 @@ jobs: parameters: QnnSDKVersion: ${{ parameters.QnnSdk }} - - script: | - export ANDROID_SDK_ROOT=/usr/local/lib/android/sdk - export ANDROID_HOME=/usr/local/lib/android/sdk - export ANDROID_NDK_HOME=/usr/local/lib/android/sdk/ndk-bundle - export ANDROID_NDK_ROOT=/usr/local/lib/android/sdk/ndk-bundle - displayName: set Android ENVs - - script: | set -e -x rm -rf /tmp/scripts From 79537d0523a7c215ef1685bf46efbd423242c4c1 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 29 Jul 2024 10:00:52 -0700 Subject: [PATCH 47/57] Remove tools/ci_build/github/android/run_nnapi_code_coverage.sh (#21371) ### Description Remove tools/ci_build/github/android/run_nnapi_code_coverage.sh ### Motivation and Context This file is no longer needed --- .../github/android/run_nnapi_code_coverage.sh | 36 ------------------- 1 file changed, 36 deletions(-) delete mode 100755 tools/ci_build/github/android/run_nnapi_code_coverage.sh diff --git a/tools/ci_build/github/android/run_nnapi_code_coverage.sh b/tools/ci_build/github/android/run_nnapi_code_coverage.sh deleted file mode 100755 index 472e824eaa47a..0000000000000 --- a/tools/ci_build/github/android/run_nnapi_code_coverage.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash - -# This script will run ORT build for Android with code coverage option - -set -e -set -x - -if [ $# -ne 1 ]; then - echo "One command line argument, the ROOT root directory, is expected" -fi - -ORT_ROOT=$1 -# Build and run onnxruntime using NNAPI execution provider targeting android emulator -python3 ${ORT_ROOT}/tools/ci_build/build.py \ - --android \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=29 \ - --skip_submodule_sync \ - --parallel \ - --use_nnapi \ - --cmake_generator=Ninja \ - --build_java \ - --path_to_protoc_exe $ORT_ROOT/protobuf_install/bin/protoc \ - --code_coverage - -# Install gcovr -python3 -m pip install gcovr - -# Retrieve runtime code coverage files from the emulator and analyze -python3 ${ORT_ROOT}/tools/ci_build/coverage.py \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME - From 0d7cf301a1e0ea784edcdf2242e973643f0bb9c9 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Tue, 30 Jul 2024 02:05:34 +0800 Subject: [PATCH 48/57] [js/webgpu] Add activation Tanh (#21540) Bug:https://github.com/microsoft/onnxruntime/issues/21467 ### Description ### Motivation and Context --- js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts | 4 +++ js/web/test/data/ops/fused-conv.jsonc | 33 +++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts index 6e66abacf3471..cfa0b42ef9eeb 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/fuse-utils.ts @@ -30,6 +30,10 @@ export const getActivationSnippet = baseType}(uniforms.beta)));`; case 'LeakyRelu': return `value = select(${baseType}(uniforms.alpha) * value, value, value >= ${valueType}(0.0));`; + case 'Tanh': + return `let e2x = exp(-2.0 * abs(value)); + value = sign(value) * (1.0 - e2x) / (1.0 + e2x); + `; case '': return ''; // TODO: adding other activations that can be fused. 
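For context (not part of the diff): the Tanh snippet added above relies on the overflow-safe identity tanh(x) = (e^{2x} - 1) / (e^{2x} + 1) = sign(x) * (1 - e^{-2|x|}) / (1 + e^{-2|x|}), so `exp` is only ever evaluated on a non-positive argument and cannot overflow for large |x|; since sign(0) = 0, tanh(0) also comes out as 0.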
diff --git a/js/web/test/data/ops/fused-conv.jsonc b/js/web/test/data/ops/fused-conv.jsonc index 6a10e3b96a26a..d88c91ebc9de7 100644 --- a/js/web/test/data/ops/fused-conv.jsonc +++ b/js/web/test/data/ops/fused-conv.jsonc @@ -430,5 +430,38 @@ ] } ] + }, + { + "name": "fused conv with tanh", + "operator": "FusedConv", + "attributes": [ + { "name": "activation", "data": "Tanh", "type": "string" }, + { "name": "kernel_shape", "data": [2, 2], "type": "ints" } + ], + "opset": { "domain": "com.microsoft", "version": 1 }, + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9], + "dims": [1, 1, 3, 3], + "type": "float32" + }, + { + "data": [0.11, 0.12, 0.13, 0.14], + "dims": [1, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.15572261810302734, 0.20409323275089264, 0.29770541191101074, 0.3425688147544861], + "dims": [1, 1, 2, 2], + "type": "float32" + } + ] + } + ] } ] From b03c9496aa081fa6c07c5b266800694c830afd60 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Mon, 29 Jul 2024 13:39:38 -0700 Subject: [PATCH 49/57] [js/web] allow load WebAssembly binary from buffer (#21534) ### Description This PR adds a new option `ort.env.wasm.wasmBinary`, which allows user to set to a buffer containing preload .wasm file content. This PR should resolve the problem from latest discussion in #20876. --- cmake/onnxruntime_webassembly.cmake | 2 +- js/common/lib/env.ts | 6 +++++ js/web/lib/wasm/wasm-factory.ts | 8 ++++++- .../e2e/browser-test-wasm-binary-override.js | 22 +++++++++++++++++++ js/web/test/e2e/run-data.js | 3 +++ 5 files changed, 39 insertions(+), 2 deletions(-) create mode 100644 js/web/test/e2e/browser-test-wasm-binary-override.js diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 7a49e90c00bce..0686b66876d9f 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -225,7 +225,7 @@ else() "SHELL:-s EXPORT_ALL=0" "SHELL:-s VERBOSE=0" "SHELL:-s FILESYSTEM=0" - "SHELL:-s INCOMING_MODULE_JS_API=[preRun,locateFile,arguments,onExit,wasmMemory,buffer,instantiateWasm,mainScriptUrlOrBlob]" + "SHELL:-s INCOMING_MODULE_JS_API=[locateFile,instantiateWasm,wasmBinary]" "SHELL:-s WASM_BIGINT=1" ${WASM_API_EXCEPTION_CATCHING} --no-entry diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index dbb5f8118363f..1a87569a115a6 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -74,6 +74,12 @@ export declare namespace Env { */ wasmPaths?: WasmPrefixOrFilePaths; + /** + * Set a custom buffer which contains the WebAssembly binary. If this property is set, the `wasmPaths` property will + * be ignored. + */ + wasmBinary?: ArrayBufferLike|Uint8Array; + /** * Set or get a boolean value indicating whether to proxy the execution of main thread to a worker thread. * diff --git a/js/web/lib/wasm/wasm-factory.ts b/js/web/lib/wasm/wasm-factory.ts index fb068ab42d04c..0f5f10716a00b 100644 --- a/js/web/lib/wasm/wasm-factory.ts +++ b/js/web/lib/wasm/wasm-factory.ts @@ -108,6 +108,7 @@ export const initializeWebAssembly = async(flags: Env.WebAssemblyFlags): Promise const mjsPathOverride = (mjsPathOverrideFlag as URL)?.href ?? mjsPathOverrideFlag; const wasmPathOverrideFlag = (wasmPaths as Env.WasmFilePaths)?.wasm; const wasmPathOverride = (wasmPathOverrideFlag as URL)?.href ?? 
wasmPathOverrideFlag; + const wasmBinaryOverride = flags.wasmBinary; const [objectUrl, ortWasmFactory] = (await importWasmModule(mjsPathOverride, wasmPrefixOverride, numThreads > 1)); @@ -135,7 +136,12 @@ export const initializeWebAssembly = async(flags: Env.WebAssemblyFlags): Promise numThreads, }; - if (wasmPathOverride || wasmPrefixOverride) { + if (wasmBinaryOverride) { + /** + * Set a custom buffer which contains the WebAssembly binary. This will skip the wasm file fetching. + */ + config.wasmBinary = wasmBinaryOverride; + } else if (wasmPathOverride || wasmPrefixOverride) { /** * A callback function to locate the WebAssembly file. The function should return the full path of the file. * diff --git a/js/web/test/e2e/browser-test-wasm-binary-override.js b/js/web/test/e2e/browser-test-wasm-binary-override.js new file mode 100644 index 0000000000000..35d427fa3b722 --- /dev/null +++ b/js/web/test/e2e/browser-test-wasm-binary-override.js @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +'use strict'; + +const documentUrl = document.currentScript.src; + +it('Browser E2E testing - WebAssembly backend', async function() { + // preload .wasm file binary + const wasmUrl = new URL('./node_modules/onnxruntime-web/dist/ort-wasm-simd-threaded.wasm', documentUrl).href; + const response = await fetch(wasmUrl); + + // make sure the .wasm file is loaded successfully + assert(response.ok); + assert(response.headers.get('Content-Type') === 'application/wasm'); + + // override wasm binary + const binary = await response.arrayBuffer(); + ort.env.wasm.wasmBinary = binary; + + await testFunction(ort, {executionProviders: ['wasm']}); +}); diff --git a/js/web/test/e2e/run-data.js b/js/web/test/e2e/run-data.js index 507192f29be9c..856f29eac6ddf 100644 --- a/js/web/test/e2e/run-data.js +++ b/js/web/test/e2e/run-data.js @@ -36,6 +36,9 @@ const BROWSER_TEST_CASES = [ [true, false, './browser-test-wasm.js', 'ort.bundle.min.mjs', ['num_threads=2', 'proxy=1']], // 2 threads, proxy [true, false, './browser-test-wasm.js', 'ort.bundle.min.mjs', ['num_threads=1', 'proxy=1']], // 1 thread, proxy + // wasm binary override: + [true, false, './browser-test-wasm-binary-override.js', 'ort.min.js'], + // path override: // wasm, path override filenames for both mjs and wasm, same origin [true, false, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=9876', 'files=mjs,wasm']], From c39f1c4fd80668fd7619719ebe7a374f4ae11a5e Mon Sep 17 00:00:00 2001 From: Preetha Veeramalai Date: Mon, 29 Jul 2024 14:12:36 -0700 Subject: [PATCH 50/57] ORT- OVEP 1.19 PR-follow up (#21546) ### Description Follow up PR for bug fixes on 1.19 ### Motivation and Context - Handles 1.19 docker file fixes. - Sets the default file naming of epctx onnx model with _ctx.onnx as suffix. - Create epctx model directories if it doesn't exist. 
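A hedged usage sketch of the EPContext dump path handling described above. It assumes the C++ API's AppendExecutionProvider_OpenVINO_V2 overload that takes a provider-options map and the ep.context_* session option keys; the path and device below are examples only.

```cpp
// Sketch: export an OpenVINO EPContext model. Per this change, the path must
// end in ".onnx" (its parent directory is created if missing); when no path is
// given, the dump falls back to "<graph_name>_ctx.onnx".
#include <onnxruntime_cxx_api.h>

Ort::SessionOptions MakeOpenVINOEpCtxOptions() {
  Ort::SessionOptions so;
  so.AddConfigEntry("ep.context_enable", "1");
  so.AddConfigEntry("ep.context_file_path", "./ov_cache/model_ctx.onnx");
  so.AppendExecutionProvider_OpenVINO_V2({{"device_type", "CPU"}});
  return so;
}
```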
--------- Co-authored-by: jatinwadhwa921 <110383850+jatinwadhwa921@users.noreply.github.com> --- dockerfiles/Dockerfile.openvino | 10 ++++------ .../providers/openvino/backend_manager.cc | 9 ++++++++- .../openvino/openvino_execution_provider.cc | 5 ----- .../openvino/openvino_provider_factory.cc | 20 ++++++++++++++++++- 4 files changed, 31 insertions(+), 13 deletions(-) diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino index 75898770acf28..39e75a68a369f 100644 --- a/dockerfiles/Dockerfile.openvino +++ b/dockerfiles/Dockerfile.openvino @@ -3,11 +3,11 @@ # SPDX-License-Identifier: MIT #-------------------------------------------------------------------------- -ARG OPENVINO_VERSION=2024.0.0 +ARG OPENVINO_VERSION=2024.2.0 # Build stage -FROM openvino/ubuntu20_runtime:${OPENVINO_VERSION} AS builder +FROM openvino/ubuntu22_runtime:${OPENVINO_VERSION} AS builder ENV WORKDIR_PATH=/home/openvino WORKDIR $WORKDIR_PATH @@ -34,20 +34,18 @@ RUN cat /etc/apt/sources.list | sed 's/^# deb-src/deb-src/g' > ./temp; mv temp / RUN apt update; apt install dpkg-dev RUN mkdir /sources WORKDIR /sources -RUN apt-get source cron iso-codes lsb-release powermgmt-base python-apt-common python3-apt python3-dbus python3-gi unattended-upgrades libapt-pkg6.0 libhogweed5 libnettle7 +RUN apt-get source cron iso-codes lsb-release powermgmt-base python-apt-common python3-apt python3-dbus python3-gi libapt-pkg6.0 libhogweed6 libnettle8 WORKDIR / RUN tar cvf GPL_sources.tar.gz /sources # Deploy stage -FROM openvino/ubuntu20_runtime:${OPENVINO_VERSION} +FROM openvino/ubuntu22_runtime:${OPENVINO_VERSION} ENV DEBIAN_FRONTEND noninteractive USER root COPY --from=builder /home/openvino/onnxruntime/build/Linux/Release/dist/*.whl ./ COPY --from=builder /GPL_sources.tar.gz ./ RUN python3 -m pip install ./*.whl && rm ./*.whl -RUN apt update; apt install -y unattended-upgrades && \ - unattended-upgrade ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev RUN adduser --uid $BUILD_UID $BUILD_USER diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 8f3658df0d09d..18a6257910a56 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -128,6 +128,13 @@ BackendManager::BackendManager(const GlobalContext& global_context, #endif } } + if (global_context_.export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { + auto status = onnxruntime::openvino_ep::BackendManager::ExportCompiledBlobAsEPCtxNode(subgraph, + logger); + if ((!status.IsOK())) { + ORT_THROW(status); + } + } } // Call EPContext model exporter here if the provider option for exporting @@ -158,7 +165,7 @@ Status BackendManager::ExportCompiledBlobAsEPCtxNode(const onnxruntime::GraphVie if (dot == std::string::npos) return graph_name; return graph_name.substr(0, dot); }(); - graph_name = graph_name + "-ov_" + GetGlobalContext().device_type + "_blob.onnx"; + graph_name = graph_name + "_ctx.onnx"; } // If embed_mode, then pass on the serialized blob // If not embed_mode, dump the blob here and only pass on the path to the blob diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 5627cb2c122fb..29c45916795d3 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -147,11 +147,6 @@ common::Status 
OpenVINOExecutionProvider::Compile( *GetLogger(), ep_ctx_handle_); - if (global_context_->export_ep_ctx_blob && !ep_ctx_handle_.IsValidOVEPCtxGraph()) { - ORT_RETURN_IF_ERROR(backend_manager->ExportCompiledBlobAsEPCtxNode(graph_body_viewer, - *GetLogger())); - } - compute_info.create_state_func = [backend_manager](ComputeContext* context, FunctionState* state) { OpenVINOEPFunctionState* p = new OpenVINOEPFunctionState(); diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 716a7cd936405..3738f2a534154 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -192,6 +192,10 @@ struct OpenVINO_Provider : Provider { } if (provider_options_map.find("num_of_threads") != provider_options_map.end()) { + if (!std::all_of(provider_options_map.at("num_of_threads").begin(), + provider_options_map.at("num_of_threads").end(), ::isdigit)) { + ORT_THROW("[ERROR] [OpenVINO-EP] Number of threads should be a number. \n"); + } num_of_threads = std::stoi(provider_options_map.at("num_of_threads")); if (num_of_threads <= 0) { num_of_threads = 1; @@ -298,7 +302,21 @@ struct OpenVINO_Provider : Provider { // The path to dump epctx model is valid only when epctx is enabled. // Overrides the cache_dir option to dump model cache files from OV. if (export_ep_ctx_blob) { - cache_dir = provider_options_map.at("so_epctx_path").c_str(); + auto ep_context_file_path_ = provider_options_map.at("so_epctx_path"); + auto file_path = std::filesystem::path(ep_context_file_path_); + // ep_context_file_path_ file extension must be .onnx + if (!ep_context_file_path_.empty() && + file_path.extension().generic_string() == ".onnx") { + // ep_context_file_path_ must be provided as a directory, create it if doesn't exist + auto parent_path = file_path.parent_path(); + if (!std::filesystem::is_directory(parent_path) && + !std::filesystem::create_directory(parent_path)) { + ORT_THROW("[ERROR] [OpenVINO] Failed to create directory : " + file_path.parent_path().generic_string() + " \n"); + } + cache_dir = ep_context_file_path_.c_str(); + } else { + ORT_THROW("[ERROR] [OpenVINO] Invalid ep_ctx_file_path" + ep_context_file_path_ + " \n"); + } } } From 7543dd040b2d32109a2718d7276d3aca1edadaae Mon Sep 17 00:00:00 2001 From: Adam Reeve Date: Tue, 30 Jul 2024 10:50:13 +1200 Subject: [PATCH 51/57] Propagate NaNs in the CPU min and max operators (#21492) ### Description Propagates NaN values in the min and max operators so that min or max with a NaN in either input always produces NaN. 
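As a minimal sketch of the intended semantics (illustrative only, assuming Eigen 3.4+; this is not the kernel code itself), Eigen's coefficient-wise min/max must opt in to NaN propagation explicitly:

```cpp
#include <Eigen/Dense>
#include <iostream>
#include <limits>

int main() {
  const float nan = std::numeric_limits<float>::quiet_NaN();
  Eigen::Array3f a(1.0f, nan, 3.0f);
  Eigen::Array3f b(2.0f, 2.0f, 2.0f);

  // Default coefficient-wise min: the result for NaN inputs is unspecified
  // and the NaN is typically dropped, which is the behavior being fixed.
  std::cout << a.min(b) << "\n";

  // NaN-propagating variant: any NaN input yields NaN in the output,
  // matching "min or max with a NaN in either input always produces NaN".
  std::cout << a.min<Eigen::PropagateNaN>(b) << "\n";
  return 0;
}
```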
### Motivation and Context Fixes #21455 --- .../providers/cpu/math/element_wise_ops.cc | 18 +- onnxruntime/test/providers/checkers.cc | 2 +- .../cpu/math/element_wise_ops_test.cc | 188 ++++++++++++++++-- 3 files changed, 187 insertions(+), 21 deletions(-) diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc index 1d524a90302e7..5ea6000da1cba 100644 --- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc +++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc @@ -705,7 +705,7 @@ Status Min_6::Compute(OpKernelContext* ctx) const { for (int index = 1; index < inputCount; index++) { auto& data_n = *ctx->Input(index); ORT_ENFORCE(data_n.Shape() == shape, "All inputs must have the same shape"); - min = min.array().min(EigenMap(data_n).array()); + min = min.array().template min(EigenMap(data_n).array()); } return Status::OK(); @@ -721,15 +721,16 @@ struct Min_8::ComputeImpl { ProcessBroadcastSpanFuncs funcs{ [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput1().array().min(per_iter_bh.ScalarInput0()); + per_iter_bh.EigenInput1().array().template min(per_iter_bh.ScalarInput0()); }, [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput0().array().min(per_iter_bh.ScalarInput1()); + per_iter_bh.EigenInput0().array().template min(per_iter_bh.ScalarInput1()); }, [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput0().array().min(per_iter_bh.EigenInput1().array()); + per_iter_bh.EigenInput0().array().template min( + per_iter_bh.EigenInput1().array()); }}; int input_count = inst.Node().InputArgCount().front(); @@ -827,7 +828,7 @@ Status Max_6::Compute(OpKernelContext* ctx) const { for (int index = 1; index < inputCount; index++) { auto& data_n = *ctx->Input(index); ORT_ENFORCE(data_n.Shape() == shape, "All inputs must have the same shape"); - max = max.array().max(EigenMap(data_n).array()); + max = max.array().template max(EigenMap(data_n).array()); } return Status::OK(); @@ -843,15 +844,16 @@ struct Max_8::ComputeImpl { ProcessBroadcastSpanFuncs funcs{ [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput1().array().max(per_iter_bh.ScalarInput0()); + per_iter_bh.EigenInput1().array().template max(per_iter_bh.ScalarInput0()); }, [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput0().array().max(per_iter_bh.ScalarInput1()); + per_iter_bh.EigenInput0().array().template max(per_iter_bh.ScalarInput1()); }, [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput0().array().max(per_iter_bh.EigenInput1().array()); + per_iter_bh.EigenInput0().array().template max( + per_iter_bh.EigenInput1().array()); }}; int input_count = inst.Node().InputArgCount().front(); diff --git a/onnxruntime/test/providers/checkers.cc b/onnxruntime/test/providers/checkers.cc index 5f332ddcddb8d..182fa4729a88f 100644 --- a/onnxruntime/test/providers/checkers.cc +++ b/onnxruntime/test/providers/checkers.cc @@ -427,7 +427,7 @@ struct TensorCheck { for (int64_t i = 0; i < size; ++i) { if (std::isnan(f_expected[i])) { - EXPECT_TRUE(std::isnan(f_expected[i])) << "Expected NaN. i:" << i; + EXPECT_TRUE(std::isnan(f_actual[i])) << "Expected NaN. i:" << i; } else if (std::isinf(f_expected[i])) { // Test infinity for equality EXPECT_EQ(f_expected[i], f_actual[i]) << "Expected infinity. 
i:" << i; } else { diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index eb3575f2cde88..bd3d21d4929f3 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -1553,6 +1553,47 @@ TEST(MathOpTest, Min_12_Float_Nan) { } } +TEST(MathOpTest, Min_12_Float_Nan_with_scalar) { + OpTester test("Min", 12); + test.AddInput("data_1", {3, 1}, + {std::numeric_limits::quiet_NaN(), -0.5f, 0.5f}); + test.AddInput("data_2", {1}, {0.25f}); + test.AddOutput("min", {3, 1}, + {std::numeric_limits::quiet_NaN(), -0.5f, 0.25f}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Min_12_Float_with_scalar_Nan) { + OpTester test("Min", 12); + test.AddInput("data_1", {2, 2}, + {0.25f, -0.25f, -0.5f, 0.5f}); + test.AddInput("data_2", {1}, {std::numeric_limits::quiet_NaN()}); + test.AddOutput("min", {2, 2}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN()}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + TEST(MathOpTest, Min_12_Double) { OpTester test("Min", 12); test.AddInput("data_0", {1, 3}, @@ -1586,12 +1627,53 @@ TEST(MathOpTest, Min_12_Double_Nan) { std::numeric_limits::quiet_NaN(), -1.0, -1.0, -2.0, 0.5, 0.0, 1.0}); - if (nullptr != DefaultCpuExecutionProvider().get()) { + if (nullptr != DefaultCpuExecutionProvider()) { std::vector> execution_providers; execution_providers.push_back(DefaultCpuExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } - if (nullptr != DefaultCudaExecutionProvider().get()) { + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Min_12_Double_Nan_with_scalar) { + OpTester test("Min", 12); + test.AddInput("data_1", {3, 1}, + {std::numeric_limits::quiet_NaN(), -0.5, 0.5}); + test.AddInput("data_2", {1}, {0.25}); + test.AddOutput("min", {3, 1}, + {std::numeric_limits::quiet_NaN(), -0.5, 0.25}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + 
execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Min_12_Double_with_scalar_Nan) { + OpTester test("Min", 12); + test.AddInput("data_1", {2, 2}, + {0.25, -0.25, -0.5, 0.5}); + test.AddInput("data_2", {1}, {std::numeric_limits::quiet_NaN()}); + test.AddOutput("min", {2, 2}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN()}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { std::vector> execution_providers; execution_providers.push_back(DefaultCudaExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); @@ -1666,7 +1748,7 @@ TEST(MathOpTest, Min_12_UInt64) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } -TEST(MathOpTest, Min_12_MLFLoat16) { +TEST(MathOpTest, Min_12_MLFloat16) { OpTester test("Min", 12); test.AddInput("data_0", {1, 3}, MakeMLFloat16({1.f, 1.f, 1.f})); @@ -1679,7 +1761,7 @@ TEST(MathOpTest, Min_12_MLFLoat16) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } -TEST(MathOpTest, Min_12_MLFLoat16_Scalar0) { +TEST(MathOpTest, Min_12_MLFloat16_Scalar0) { OpTester test("Min", 12); test.AddInput("data_0", {}, MakeMLFloat16({-10.f})); @@ -1692,7 +1774,7 @@ TEST(MathOpTest, Min_12_MLFLoat16_Scalar0) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } -TEST(MathOpTest, Min_12_MLFLoat16_Scalar1) { +TEST(MathOpTest, Min_12_MLFloat16_Scalar1) { OpTester test("Min", 12); test.AddInput("data_0", {1, 3}, MakeMLFloat16({2.f, 3.f, 4.f})); @@ -1809,12 +1891,53 @@ TEST(MathOpTest, Max_12_Float_Nan) { std::numeric_limits::quiet_NaN(), -0.5f, 0.0f, -1.0f, 1.0f, 1.0f, 2.0f}); - if (nullptr != DefaultCpuExecutionProvider().get()) { + if (nullptr != DefaultCpuExecutionProvider()) { std::vector> execution_providers; execution_providers.push_back(DefaultCpuExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } - if (nullptr != DefaultCudaExecutionProvider().get()) { + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Max_12_Float_Nan_with_scalar) { + OpTester test("Max", 12); + test.AddInput("data_1", {3, 1}, + {std::numeric_limits::quiet_NaN(), -0.5f, 0.5f}); + test.AddInput("data_2", {1}, {0.25f}); + test.AddOutput("max", {3, 1}, + {std::numeric_limits::quiet_NaN(), 0.25f, 0.5f}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + 
execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Max_12_Float_with_scalar_Nan) { + OpTester test("Max", 12); + test.AddInput("data_1", {2, 2}, + {0.25f, -0.25f, -0.5f, 0.5f}); + test.AddInput("data_2", {1}, {std::numeric_limits::quiet_NaN()}); + test.AddOutput("max", {2, 2}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN()}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { std::vector> execution_providers; execution_providers.push_back(DefaultCudaExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); @@ -1854,12 +1977,53 @@ TEST(MathOpTest, Max_12_Double_Nan) { std::numeric_limits::quiet_NaN(), -0.5, 0.0, -1.0, 1.0, 1.0, 2.0}); - if (nullptr != DefaultCpuExecutionProvider().get()) { + if (nullptr != DefaultCpuExecutionProvider()) { std::vector> execution_providers; execution_providers.push_back(DefaultCpuExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } - if (nullptr != DefaultCudaExecutionProvider().get()) { + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Max_12_Double_Nan_with_scalar) { + OpTester test("Max", 12); + test.AddInput("data_1", {3, 1}, + {std::numeric_limits::quiet_NaN(), -0.5, 0.5}); + test.AddInput("data_2", {1}, {0.25}); + test.AddOutput("max", {3, 1}, + {std::numeric_limits::quiet_NaN(), 0.25, 0.5}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + +TEST(MathOpTest, Max_12_Double_with_scalar_Nan) { + OpTester test("Max", 12); + test.AddInput("data_1", {2, 2}, + {0.25, -0.25, -0.5, 0.5}); + test.AddInput("data_2", {1}, {std::numeric_limits::quiet_NaN()}); + test.AddOutput("max", {2, 2}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN()}); + if (nullptr != DefaultCpuExecutionProvider()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider()) { std::vector> execution_providers; execution_providers.push_back(DefaultCudaExecutionProvider()); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); @@ -1934,7 +2098,7 @@ TEST(MathOpTest, Max_12_UInt64) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", 
{kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } -TEST(MathOpTest, Max_12_MLFLoat16) { +TEST(MathOpTest, Max_12_MLFloat16) { OpTester test("Max", 12); test.AddInput("data_0", {1, 3}, MakeMLFloat16({-1.f, -1.f, -1.f})); @@ -1947,7 +2111,7 @@ TEST(MathOpTest, Max_12_MLFLoat16) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } -TEST(MathOpTest, Max_12_MLFLoat16_Scalar0) { +TEST(MathOpTest, Max_12_MLFloat16_Scalar0) { OpTester test("Max", 12); test.AddInput("data_0", {}, MakeMLFloat16({-1.f})); @@ -1960,7 +2124,7 @@ TEST(MathOpTest, Max_12_MLFLoat16_Scalar0) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } -TEST(MathOpTest, Max_12_MLFLoat16_Scalar1) { +TEST(MathOpTest, Max_12_MLFloat16_Scalar1) { OpTester test("Max", 12); test.AddInput("data_0", {1, 3}, MakeMLFloat16({-1.f, -2.f, -3.f})); From d98581495f996084af65ae1e6600378bed949460 Mon Sep 17 00:00:00 2001 From: Sophie Schoenmeyer <107952697+sophies927@users.noreply.github.com> Date: Mon, 29 Jul 2024 16:06:03 -0700 Subject: [PATCH 52/57] Update labeling bot (#21548) Current labeling bot over-applies many of the labels (e.g., ep:CUDA and platform:windows) and is missing some of the APIs + EPs Working on migrating this workflow to GitHub policies but would like to use this fix in the meantime to avoid causing any issues w/ ORT 1.19 ### Description ### Motivation and Context --- .github/labeler.yml | 31 ++++++++++++++---------- .github/title-only-labeler.yml | 4 +++ .github/workflows/title-only-labeler.yml | 20 +++++++++++++++ 3 files changed, 42 insertions(+), 13 deletions(-) create mode 100644 .github/title-only-labeler.yml create mode 100644 .github/workflows/title-only-labeler.yml diff --git a/.github/labeler.yml b/.github/labeler.yml index 526d8a643e713..c14e2a213bc60 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -1,20 +1,25 @@ -api:javascript: '/\bjavascript\b/i' +api:CSharp: '/(\bc\s*sharp\b|\bc#)/i' api:java: '/\bjava\b/i' +api:javascript: '/\bjavascript\b/i' ep:ACL: '/\bacl\b/i' ep:ArmNN: '/\barmnn\b/i' -ep:CUDA: '/\bcuda\b/i' -ep:DML: '/(\bdirectml\b|\bdml\b)/i' -ep:MIGraphX: '/\bmigraphx\b/i' -ep:oneDNN: '/\bonednn\b/i' +ep:CANN: '/\bcann\b/i' +ep:CoreML: '/\bcore\s*ml\b/i' +ep:DML: '/(\bdirect\s*ml\b|\bdml\b)/i' +ep:MIGraphX: '/\bmi\s*graph\s*x\b/i' +ep:oneDNN: '/\bone\s*dnn\b/i' ep:OpenVINO: '/\bopen\s*vino\b/i' -ep:RockchipNPU: '/\brockchip\b/i' +ep:QNN: '/\bqnn\b/i' +ep:RockchipNPU: '/\brockchip(?:npu)?\b/i' ep:ROCm: '/\brocm\b/i' -ep:TensorRT: '/(\btensor\s*rt\b|\btrt\b)/i' +ep:SNPE: '/\bsnpe\b/i' ep:tvm: '/\btvm\b/i' ep:VitisAI: '/\bvitis(?:ai)?\b/i' -platform:jetson: '/\bjetson\b/i' -platform:mobile: '/(\bobj(?:ective)?-?c\b|\bnnapi\b|\bcore-?ml\b|\bmobile\b|\bandroid\b|\bios\b|\bxamarin\b|\bmaui\b)/i' -platform:web: '/(\bwebgl\b|\bweb-?gpu\b|\bwasm\b|\bonnxruntime-node\b|\bonnxruntime-web\b)/i' -platform:windows: '/(\bwindows\b|\bwinrt\b|\bwinml\b)/i' -model:transformer: '/(\bbert\b|\bgpt-?2\b|\bhugging-?face\b|\blong-?former\b|\bt5\b)/i' -quantization: '/(is this a quantized model\?\n\nYes|\bquantization\b)/i' +ep:WebGPU: '/\bwebgpu\b/i' +ep:WebNN: '/\bwebnn\b/i' +ep:Xnnpack: '/\bxnn\s*pack\b/i' +.NET: '/(\bdot\s*net\b|\bnuget\b|\.net\b)/i' +platform:jetson: '/(\bjetson\b|\bjetpack\b)/i' +platform:mobile: '/(\bobj(?:ective)?-?c\b|\bnnapi\b|\bmobile\b|\bandroid\b|\bios\b|\bxamarin\b|\bmaui\b)/i' +platform:web: 
'/(\bwebgl\b|\bweb-?gpu\b|\bwasm\b|\bonnxruntime-node\b|\bonnxruntime-web\b|\bonnxruntime-react-native\b|\bnpm\b|\btransformers\.js\b)/i' +model:transformer: '/\btransformers(?!\.js)\b/i' diff --git a/.github/title-only-labeler.yml b/.github/title-only-labeler.yml new file mode 100644 index 0000000000000..4980f7251bcb4 --- /dev/null +++ b/.github/title-only-labeler.yml @@ -0,0 +1,4 @@ +ep:CUDA: '/\bcuda\b/i' +ep:TensorRT: '/(\btensor\s*rt\b|\btrt\b)/i' +platform:windows: '/(\bwindows\b|\bwinrt\b|\bwinml\b)/i' +quantization: '/(quant|\bqdq\b)/i' diff --git a/.github/workflows/title-only-labeler.yml b/.github/workflows/title-only-labeler.yml new file mode 100644 index 0000000000000..e0af2dd06b1b7 --- /dev/null +++ b/.github/workflows/title-only-labeler.yml @@ -0,0 +1,20 @@ +name: "Title Only Issue Labeler" +on: + issues: + types: [opened, edited] + +permissions: + issues: write + +jobs: + triage: + runs-on: ubuntu-latest + steps: + - uses: github/issue-labeler@v3.4 + with: + repo-token: "${{ secrets.GITHUB_TOKEN }}" + configuration-path: .github/title-only-labeler.yml + not-before: 2020-01-15T02:54:32Z + enable-versioned-regex: 0 + include-title: 1 + include-body: 0 From 8417c325ec160dc8ee62edaf6d1daf91ad979d56 Mon Sep 17 00:00:00 2001 From: mcollinswisc Date: Mon, 29 Jul 2024 16:06:51 -0700 Subject: [PATCH 53/57] Keep QDQ nodes w/ nonpositive scale around MaxPool (#21182) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description This change adds a check for whether the scale in the QuantizeLinear (or DequantizeLinear) is a positive scalar, and a new selector to disallow removing the QDQ around MaxPool if it is not. ### Motivation and Context Currently, the DropQDQNodesRules optimization removes QuantizeLinear and DequantizeLinear nodes from DequantizeLinear ∘ MaxPool ∘ QuantizeLinear. However, if the x_scale/y_scale values are non-positive, the (de-)quantization changes the ordering of the elements in the input value, so this optimization is changing the results. 
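A tiny numeric sketch of that ordering problem (illustrative only, not code from this change): with a negative scale, larger real values map to smaller quantized values, so a max taken over quantized data picks the wrong element.

```cpp
#include <cmath>
#include <cstdio>

int main() {
  const float scale = -0.1f;  // non-positive x_scale/y_scale, as described above
  const int zero_point = 0;
  const float x[2] = {0.3f, 0.9f};  // real-valued max is 0.9
  const long q0 = std::lround(x[0] / scale) + zero_point;  // -3
  const long q1 = std::lround(x[1] / scale) + zero_point;  // -9
  const long q_max = q0 > q1 ? q0 : q1;                    // -3, i.e. the quantized 0.3
  std::printf("%g\n", (q_max - zero_point) * scale);       // prints 0.3, not 0.9
  return 0;
}
```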
https://github.com/microsoft/onnxruntime/issues/21176 --------- Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- .../optimizer/qdq_transformer/qdq_util.cc | 35 ++++++++++++++ .../core/optimizer/qdq_transformer/qdq_util.h | 4 ++ .../qdq_selector_action_transformer.cc | 27 +++++++++-- .../selectors_actions/qdq_selectors.cc | 7 +++ .../selectors_actions/qdq_selectors.h | 10 ++-- .../test/optimizer/qdq_transformer_test.cc | 46 +++++++++++++++++++ 6 files changed, 120 insertions(+), 9 deletions(-) diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc index a4d1ea3c7cf56..7ef4ced1835f0 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.cc @@ -166,6 +166,41 @@ bool QOrDQNodeHasConstantScalarScaleAndZeroPoint( return true; } +bool IsQOrDQScalePositiveConstantScalar( + const Node& q_or_dq_node, const GetConstantInitializerFn& get_const_initializer, + const std::filesystem::path& model_path) { + auto q_or_dq_input_defs = q_or_dq_node.InputDefs(); + + ORT_ENFORCE(q_or_dq_input_defs.size() >= 2); + + if (!optimizer_utils::IsScalar(*q_or_dq_input_defs[InputIndex::SCALE_ID])) { + return false; + } + + const ONNX_NAMESPACE::TensorProto* q_or_dq_scale_tensor_proto = + get_const_initializer(q_or_dq_input_defs[InputIndex::SCALE_ID]->Name()); + if (nullptr == q_or_dq_scale_tensor_proto) { + return false; + } + + Initializer q_or_dq_scale(*q_or_dq_scale_tensor_proto, model_path); + + switch (q_or_dq_scale.data_type()) { + case ONNX_NAMESPACE::TensorProto::FLOAT: + return q_or_dq_scale.data()[0] > 0; + + case ONNX_NAMESPACE::TensorProto::FLOAT16: + return q_or_dq_scale.data()[0] > 0; + + case ONNX_NAMESPACE::TensorProto::BFLOAT16: + return q_or_dq_scale.data()[0] > 0; + + default: + assert(false); + return false; + } +} + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) bool MatchQNode(const Node& node) { diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.h b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.h index 5d11b8bfd5558..008f9972a143b 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_util.h +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_util.h @@ -65,6 +65,10 @@ bool QOrDQNodeHasConstantScalarScaleAndZeroPoint( const GetConstantInitializerFn& get_const_initializer, bool& zero_point_exists); +// Checks that the y_scale/x_scale input to the QuantizeLinear/DequantizeLinear node is a positive scalar. +bool IsQOrDQScalePositiveConstantScalar(const Node& q_or_dq_node, const GetConstantInitializerFn& get_const_initializer, + const std::filesystem::path& model_path); + #if !defined(ORT_MINIMAL_BUILD) || defined(ORT_EXTENDED_MINIMAL_BUILD) // Check Q node op type, version, and domain. bool MatchQNode(const Node& node); diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc index 17e66a3953b97..d81701fdf443b 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selector_action_transformer.cc @@ -35,6 +35,7 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) { // 3 nodes. DQ, target, Q. Merge into target and remove DQ and Q. 
const std::string drop_action_name{"drop"}; const std::string drop_action_no_int16_name{"drop_no_int16_support"}; + const std::string drop_action_no_int16_and_positive_scale_name{"drop_no_int16_support_and_positive_scale"}; NTO::NodeLocation dq{NTO::NodeType::kInput, 0}; NTO::NodeLocation q{NTO::NodeType::kOutput, 0}; @@ -46,19 +47,32 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) { std::unique_ptr drop_action_no_int16 = std::make_unique( std::vector(moves)); // Copy before std::move(moves) + std::unique_ptr drop_action_no_int16_and_positive_scale = std::make_unique( + std::vector(moves)); // Copy before std::move(moves) std::unique_ptr drop_action = std::make_unique(std::move(moves)); #if !defined(ORT_MINIMAL_BUILD) - // Use a separate selector + action that disallows 16-bit types for MaxPool and Resize. + // Use separate selectors & actions for MaxPool and Resize. + // + // They disallow 16-bit types for MaxPool and Resize: // int16 MaxPool is not supported by the ONNX specification. // int16 Resize is not supported by the ORT implementation (although allowed by ONNX). - std::unique_ptr selector_disallow_16bit = std::make_unique(false); + // + // And cannot eliminate the QDQ for MaxPool if the scale is not positive, as a negative + // scale will change the ordering of the elements between quantized & de-quantized values. + std::unique_ptr selector_no_16bit = std::make_unique(false); qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_name, - {{"MaxPool", {12}}, - {"Resize", {}}}, - std::move(selector_disallow_16bit), + {{"Resize", {}}}, + std::move(selector_no_16bit), std::move(drop_action_no_int16)); + std::unique_ptr selector_no_16bit_and_positive_scale = + std::make_unique(false, true, false); + qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_no_int16_and_positive_scale_name, + {{"MaxPool", {12}}}, + std::move(selector_no_16bit_and_positive_scale), + std::move(drop_action_no_int16_and_positive_scale)); + std::unique_ptr selector = std::make_unique(true); qdq_selector_action_registry.RegisterSelectorAndAction(drop_action_name, {{"Gather", {}}, @@ -70,6 +84,9 @@ void DropQDQNodesRules(SelectorActionRegistry& qdq_selector_action_registry) { std::move(drop_action)); #else qdq_selector_action_registry.RegisterAction(drop_action_no_int16_name, std::move(drop_action_no_int16)); + qdq_selector_action_registry.RegisterAction( + drop_action_no_int16_and_positive_scale_name, + std::move(drop_action_no_int16_and_positive_scale)); qdq_selector_action_registry.RegisterAction(drop_action_name, std::move(drop_action)); #endif } diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc index e271ae8df3356..203aba2c3dd91 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.cc @@ -150,6 +150,13 @@ bool DropQDQNodeGroupSelector::Check(const GraphViewer& graph_viewer, return graph_viewer.GetConstantInitializer(initializer_name, true); }; + if (!allow_nonpositive_scale_) { + // IsQDQPairSupported will check that the scale is the same between q_node and dq_node. 
+ if (!IsQOrDQScalePositiveConstantScalar(q_node, get_const_initializer, graph_viewer.ModelPath())) { + return false; + } + } + return IsQDQPairSupported(q_node, dq_node, get_const_initializer, graph_viewer.ModelPath()); } diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h index 491a15b62cb03..7e009da39403b 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_selectors.h @@ -48,8 +48,9 @@ class NodeGroupSelector { // Zero point and scale are constant scalars and must match class DropQDQNodeGroupSelector : public NodeGroupSelector { public: - explicit DropQDQNodeGroupSelector(bool allow_16bit = true, bool allow_4bit = true) - : allow_16bit_(allow_16bit), allow_4bit_(allow_4bit) {} + explicit DropQDQNodeGroupSelector(bool allow_16bit = true, bool allow_4bit = true, + bool allow_nonpositive_scale = true) + : allow_16bit_(allow_16bit), allow_4bit_(allow_4bit), allow_nonpositive_scale_(allow_nonpositive_scale) {} private: bool Check(const GraphViewer& graph_viewer, const Node& node, @@ -58,6 +59,7 @@ class DropQDQNodeGroupSelector : public NodeGroupSelector { bool allow_16bit_; bool allow_4bit_; + bool allow_nonpositive_scale_; }; // Single DQ -> node. @@ -300,8 +302,8 @@ class BaseSelector : public NodeSelector { class DropQDQNodesSelector : public BaseSelector { public: - explicit DropQDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false) - : BaseSelector(std::make_unique(allow_16bit, allow_4bit)) {} + explicit DropQDQNodesSelector(bool allow_16bit = false, bool allow_4bit = false, bool allow_nonpositive_scale = true) + : BaseSelector(std::make_unique(allow_16bit, allow_4bit, allow_nonpositive_scale)) {} }; class DropDQNodesSelector : public BaseSelector { diff --git a/onnxruntime/test/optimizer/qdq_transformer_test.cc b/onnxruntime/test/optimizer/qdq_transformer_test.cc index 367b4a65e3b7b..a043d6553bdfd 100644 --- a/onnxruntime/test/optimizer/qdq_transformer_test.cc +++ b/onnxruntime/test/optimizer/qdq_transformer_test.cc @@ -980,6 +980,52 @@ TEST(QDQTransformerTests, ReshapeDropQDQ) { RunReshapeDropQDQTestCase({1, 3, 2, 2}, {1, 12}, false, 21); // Use int16 ONNX QDQ ops } +// Runs a test case that checks if Q/DQ nodes are *not* dropped from DQ -> MaxPool -> Q if the quantization scale is +// negative. 
+template +static void RunMaxPoolNegativeScaleDropQDQTestCase() { + auto build_test_case = [](ModelTestBuilder& builder) { + constexpr QuantType qmin = std::numeric_limits::min(); + constexpr QuantType qmax = std::numeric_limits::max(); + + const std::vector input_shape = {1, 17, 17, 3}; + auto* input_arg = builder.MakeInput(input_shape, qmin, qmax); + auto* output_arg = builder.MakeOutput(); + + constexpr float scale = -0.003f; + QuantType zero_point = 1 + (qmax + qmin) / 2; + + auto* input_arg_dq = builder.MakeIntermediate(); + auto* maxpool_output = builder.MakeIntermediate(); + + builder.AddDequantizeLinearNode(input_arg, scale, zero_point, input_arg_dq); + + Node& maxpool_node = builder.AddNode("MaxPool", {input_arg_dq}, {maxpool_output}); + maxpool_node.AddAttribute("auto_pad", "VALID"); + maxpool_node.AddAttribute("kernel_shape", std::vector({2, 2})); + + builder.AddQuantizeLinearNode(maxpool_output, scale, zero_point, output_arg); + }; + + auto check_graph = [](InferenceSessionWrapper& session) { + auto op_to_count = CountOpsInGraph(session.GetGraph()); + EXPECT_EQ(op_to_count["MaxPool"], 1); + EXPECT_EQ(op_to_count["QuantizeLinear"], 1); + EXPECT_EQ(op_to_count["DequantizeLinear"], 1); + }; + + constexpr int opset = 21; + TransformerTester(build_test_case, check_graph, TransformerLevel::Level1, TransformerLevel::Level2, opset); +} + +// Checks that Q/DQ nodes are *not* dropped from DQ -> MaxPool -> Q for negative scale. Uses 8-bit and 16-bit Q/DQ ops. +TEST(QDQTransformerTests, MaxpoolDontDropQDQForNegativeScale) { + RunMaxPoolNegativeScaleDropQDQTestCase(); + RunMaxPoolNegativeScaleDropQDQTestCase(); + RunMaxPoolNegativeScaleDropQDQTestCase(); + RunMaxPoolNegativeScaleDropQDQTestCase(); +} + // Runs a test case that checks if Q/DQ nodes are dropped from DQ -> (Un)Squeeze -> Q. 
template static void RunSqueezeUnsqueezeDropQDQTestCase(const std::string& squeeze_type, From 5d78b9a17bb6d126f8ae7fa7eef05cabe4a08dae Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Mon, 29 Jul 2024 17:27:38 -0700 Subject: [PATCH 54/57] [TensorRT EP] Update TRT OSS Parser to 10.2 (#21552) ### Description Update TRT OSS Parser to [latest 10.2-GA branch](https://github.com/onnx/onnx-tensorrt/commit/f161f95883b4ebd8cb789de5efc67b73c0a6e694) ### Motivation and Context --- cgmanifests/generated/cgmanifest.json | 2 +- cmake/deps.txt | 4 ++-- .../github/azure-pipelines/templates/download-deps.yml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 66b305a6d36de..7de3f346f6386 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -216,7 +216,7 @@ "component": { "type": "git", "git": { - "commitHash": "06adf4461ac84035bee658c6cf5df39f7ab6071d", + "commitHash": "f161f95883b4ebd8cb789de5efc67b73c0a6e694", "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git" }, "comments": "onnx_tensorrt" diff --git a/cmake/deps.txt b/cmake/deps.txt index 9d206b6bb3aeb..d0edf963451d5 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -37,8 +37,8 @@ mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.1.zip;2eb9198bb352757d5ff13977cbe0634898e0837c -#use the latest commit of 10.0-GA -onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/06adf4461ac84035bee658c6cf5df39f7ab6071d.zip;46dceef659d75d276e7914a8057c2282269d5e7b +#use the latest commit of 10.2-GA +onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/f161f95883b4ebd8cb789de5efc67b73c0a6e694.zip;2148d0c79a171abf2b9451f3bfec164e85caf2ef protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874 diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index bf11730c2ce28..01965343c4592 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.167 + version: 1.0.173 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.167 + version: 1.0.173 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. 
From 07d3be5b0e037927c3defd8a7e389e59ec748ad8 Mon Sep 17 00:00:00 2001 From: vraspar Date: Mon, 29 Jul 2024 21:04:47 -0700 Subject: [PATCH 55/57] CoreML: Add ML Program Split Op (#21456) ### Description Add support for Split Op ### Motivation and Context Address operator gaps in high priority model. --------- Co-authored-by: Scott McKay Co-authored-by: Edward Chen <18449977+edgchen1@users.noreply.github.com> --- .../coreml/builders/impl/split_op_builder.cc | 138 ++++++++++++------ .../apple/coreml_supported_mlprogram_ops.md | 1 + 2 files changed, 94 insertions(+), 45 deletions(-) diff --git a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc index 0497357c45c54..dbd0f48576f8b 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/split_op_builder.cc @@ -5,6 +5,7 @@ #include "core/providers/common.h" #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -24,6 +25,8 @@ class SplitOpBuilder : public BaseOpBuilder { // Split opset 13- uses "split" as attribute. Currently it's not supported. int GetMinSupportedOpSet(const Node& /* node */) const override { return 13; } + + bool SupportsMLProgram() const override { return true; } }; void SplitOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { @@ -43,55 +46,98 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, ORT_RETURN_IF_NOT(GetShape(*node.InputDefs()[0], data_shape, logger), "Failed to get input shape."); NodeAttrHelper helper(node); - const auto axis = helper.Get("axis", 0); + int64_t axis = helper.Get("axis", 0); - // attribute introduced since opset 18 - uint64_t num_outputs; - - std::unique_ptr layer = model_builder.CreateNNLayer(node); - auto* coreml_splitnd = layer->mutable_splitnd(); - coreml_splitnd->set_axis(axis); - - if (input_defs.size() > 1) { - // if "split" is explicitly provided as an input - const auto& split_tensor = *model_builder.GetInitializerTensors().at(input_defs[1]->Name()); - Initializer unpacked_tensor(split_tensor); - auto split_span = unpacked_tensor.DataAsSpan(); - auto split_sizes = split_span.size(); - num_outputs = narrow(split_sizes); - for (size_t i = 0; i < split_sizes; i++) { - coreml_splitnd->add_splitsizes(split_span[i]); - } - } else if (node.SinceVersion() < 18) { - num_outputs = narrow(node.OutputDefs().size()); - coreml_splitnd->set_numsplits(num_outputs); - } else { - // note: for opset 18+ 'num_outputs' is a required attribute - num_outputs = narrow(helper.GetInt64("num_outputs").value()); + auto calculate_remainder_and_chunk_size = [&](int32_t num_outputs) { // note: checked in IsOpSupportedImpl that ensures the dim value at splitting axis exists auto split_dim_size = data_shape[HandleNegativeAxis(axis, data_shape.size())]; - uint64_t chunk_size = narrow((split_dim_size + num_outputs - 1) / num_outputs); + uint64_t chunk_size = (split_dim_size + num_outputs - 1) / num_outputs; uint64_t remainder = split_dim_size % chunk_size; - if (remainder) { - // uneven - auto split_sizes = InlinedVector(num_outputs, chunk_size); - split_sizes.back() = remainder; - for (size_t i = 0; i < 
split_sizes.size(); i++) { - coreml_splitnd->add_splitsizes(split_sizes[i]); - } + return std::make_tuple(remainder, chunk_size); + }; + +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + std::unique_ptr split_op = model_builder.CreateOperation(node, "split"); + AddOperationInput(*split_op, "axis", model_builder.AddScalarConstant(split_op->type(), "axis", axis)); + + if (input_defs.size() > 1) { + // if "split" is explicitly provided as an input + Initializer unpacked_tensor(*model_builder.GetConstantInitializer(input_defs[1]->Name())); + auto split_span = unpacked_tensor.DataAsSpan(); + AddOperationInput(*split_op, "split_sizes", + model_builder.AddConstant(split_op->type(), "split_sizes", split_span)); + } else if (node.SinceVersion() < 18) { + int64_t num_outputs = narrow(node.OutputDefs().size()); + AddOperationInput(*split_op, "num_splits", + model_builder.AddScalarConstant(split_op->type(), "num_splits", num_outputs)); } else { - // even + // note: for opset 18+ 'num_outputs' is a required attribute + int64_t num_outputs = helper.GetInt64("num_outputs").value(); + auto [remainder, chunk_size] = calculate_remainder_and_chunk_size(static_cast(num_outputs)); + if (remainder) { + // uneven + std::vector split_sizes(num_outputs, chunk_size); + split_sizes.back() = remainder; + AddOperationInput(*split_op, "split_sizes", + model_builder.AddConstant(split_op->type(), "split_sizes", split_sizes)); + } else { + // even + AddOperationInput(*split_op, "num_splits", + model_builder.AddScalarConstant(split_op->type(), "num_splits", num_outputs)); + } + } + + AddOperationInput(*split_op, "x", input_defs[0]->Name()); + for (const auto& output_def : node.OutputDefs()) { + AddOperationOutput(*split_op, *output_def); + } + model_builder.AddOperation(std::move(split_op)); + + } else +#endif + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + auto* coreml_splitnd = layer->mutable_splitnd(); + coreml_splitnd->set_axis(axis); + + if (input_defs.size() > 1) { + // if "split" is explicitly provided as an input + // const auto& split_tensor = *model_builder.GetInitializerTensors().at(input_defs[1]->Name()); + Initializer unpacked_tensor(*model_builder.GetConstantInitializer(input_defs[1]->Name())); + auto split_span = unpacked_tensor.DataAsSpan(); + for (const auto& split_size : split_span) { + coreml_splitnd->add_splitsizes(split_size); + } + } else if (node.SinceVersion() < 18) { + uint64_t num_outputs = narrow(node.OutputDefs().size()); coreml_splitnd->set_numsplits(num_outputs); + } else { + // note: for opset 18+ 'num_outputs' is a required attribute + uint64_t num_outputs = narrow(helper.GetInt64("num_outputs").value()); + auto [remainder, chunk_size] = calculate_remainder_and_chunk_size(static_cast(num_outputs)); + if (remainder) { + // uneven + auto split_sizes = InlinedVector(num_outputs, chunk_size); + split_sizes.back() = remainder; + for (size_t i = 0; i < split_sizes.size(); i++) { + coreml_splitnd->add_splitsizes(split_sizes[i]); + } + } else { + // even + coreml_splitnd->set_numsplits(num_outputs); + } } - } - *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); - // variadic number of outputs. Calculated based on the length of the given splitSizes if provided. - // Otherwise, uses attribute value 'num_outputs'. 
- for (uint64_t i = 0; i < num_outputs; i++) { - *layer->mutable_output()->Add() = node.OutputDefs()[i]->Name(); + *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); + // variadic number of outputs. Calculated based on the length of the given splitSizes if provided. + // Otherwise, uses attribute value 'num_outputs'. + for (const auto& output_def : node.OutputDefs()) { + *layer->mutable_output()->Add() = output_def->Name(); + } + model_builder.AddLayer(std::move(layer)); } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } @@ -99,7 +145,6 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool SplitOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); - const auto& initializers = input_params.graph_viewer.GetAllInitializedTensors(); NodeAttrHelper helper(node); const auto axis = helper.Get("axis", 0); @@ -110,16 +155,19 @@ bool SplitOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputPar const auto split_dims_at_axis = input_shape[HandleNegativeAxis(axis, input_shape.size())]; if (input_defs.size() > 1 && input_defs[1]->Exists()) { - if (!CheckIsConstantInitializer(*input_defs[1], input_params.graph_viewer, logger, "'split'")) { + const auto* splits_tensor = input_params.graph_viewer.GetConstantInitializer(input_defs[1]->Name()); + if (!splits_tensor) { + LOGS(logger, VERBOSE) << "CoreML 'splits' input must be a constant initializer."; return false; } + const auto split_shape = *input_defs[1]->Shape(); if (split_shape.dim_size() < 2) { - LOGS(logger, VERBOSE) << "CoreML SplitND requires to produce at least 2 outputs."; + LOGS(logger, VERBOSE) << "CoreML Split must produce at least 2 outputs."; return false; } - const auto& splits_tensor = *initializers.at(input_defs[1]->Name()); - Initializer unpacked_tensor(splits_tensor); + + Initializer unpacked_tensor(*splits_tensor); auto splits_span = unpacked_tensor.DataAsSpan(); int64_t sum_of_splits = std::accumulate(splits_span.begin(), splits_span.end(), int64_t{0}); if (sum_of_splits != split_dims_at_axis) { diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index d2a961f17bd6a..b546c266c131b 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -24,6 +24,7 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Reshape|| |ai.onnx:Resize|See [resize_op_builder.cc](https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/providers/coreml/builders/impl/resize_op_builder.cc) implementation. There are too many permutations to describe the valid combinations.| |ai.onnx.Slice|starts/ends/axes/steps must be constant initializers.| +|ai.onnx:Split|| |ai.onnx:Sub|| |ai.onnx:Sigmoid|| |ai:onnx:Tanh|| From 82036b04978b7930185996a70d2146c2895469ea Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Mon, 29 Jul 2024 21:59:16 -0700 Subject: [PATCH 56/57] Remove references to the outdated CUDA EP factory method (#21549) The function "OrtSessionOptionsAppendExecutionProvider_CUDA" is deprecated. 
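For context, a minimal sketch of the non-deprecated pattern (assumptions: error handling reduced to `Ort::ThrowOnError`, the helper name is made up, and the options struct is released afterwards on the assumption that Append copies what it needs):

```cpp
#include <onnxruntime_cxx_api.h>

// Hypothetical helper: attach the CUDA EP through the V2 provider options
// instead of the deprecated OrtSessionOptionsAppendExecutionProvider_CUDA.
Ort::SessionOptions MakeCudaSessionOptions() {
  Ort::SessionOptions session_options;
  OrtCUDAProviderOptionsV2* cuda_options = nullptr;
  Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&cuda_options));
  session_options.AppendExecutionProvider_CUDA_V2(*cuda_options);
  Ort::GetApi().ReleaseCUDAProviderOptions(cuda_options);
  return session_options;
}
```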
--- .../global_thread_pools/test_inference.cc | 4 +++- onnxruntime/test/shared_lib/test_inference.cc | 20 ++++++++++++++----- .../test/shared_lib/test_model_loading.cc | 5 +++-- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/onnxruntime/test/global_thread_pools/test_inference.cc b/onnxruntime/test/global_thread_pools/test_inference.cc index f553682975f11..c6d958536f488 100644 --- a/onnxruntime/test/global_thread_pools/test_inference.cc +++ b/onnxruntime/test/global_thread_pools/test_inference.cc @@ -74,7 +74,9 @@ static Ort::Session GetSessionObj(Ort::Env& env, T model_uri, int provider_type) if (provider_type == 1) { #ifdef USE_CUDA - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + session_options.AppendExecutionProvider_CUDA_V2(*options); std::cout << "Running simple inference with cuda provider" << std::endl; #else return Ort::Session(nullptr); diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 52491a179c2ce..7a33bf8a527cd 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -1959,7 +1959,9 @@ TEST(CApiTest, get_allocator_cpu) { #ifdef USE_CUDA TEST(CApiTest, get_allocator_cuda) { Ort::SessionOptions session_options; - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + session_options.AppendExecutionProvider_CUDA_V2(*options); Ort::Session session(*ort_env, NAMED_AND_ANON_DIM_PARAM_URI, session_options); Ort::MemoryInfo info_cuda("Cuda", OrtAllocatorType::OrtArenaAllocator, 0, OrtMemTypeDefault); @@ -2076,7 +2078,9 @@ TEST(CApiTest, io_binding_cuda) { #ifdef USE_TENSORRT Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(session_options, 0)); #else - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + session_options.AppendExecutionProvider_CUDA_V2(*options); #endif Ort::Session session(*ort_env, MODEL_URI, session_options); @@ -3438,7 +3442,9 @@ TEST(CApiTest, AllocateInitializersFromNonArenaMemory) { Ort::SessionOptions session_options; #ifdef USE_CUDA - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + session_options.AppendExecutionProvider_CUDA_V2(*options); #else // arena is enabled but the sole initializer will still be allocated from non-arena memory Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CPU(session_options, 1)); @@ -3890,7 +3896,9 @@ TEST(CApiTest, GitHubIssue10179) { try { const auto* model_path = MODEL_URI; Ort::SessionOptions session_options{}; - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + session_options.AppendExecutionProvider_CUDA_V2(*options); Ort::Session session{*ort_env, model_path, session_options}; } catch (const std::exception& e) { std::cerr << "exception: " << e.what() << "\n"; @@ -3920,7 +3928,9 @@ TEST(CApiTest, GitHubIssue10179) { TEST(CApiTest, 
TestCudaMemcpyToHostWithSequenceTensors) { const auto* model_path = SEQUENCE_MODEL_URI_2; Ort::SessionOptions session_options{}; - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + session_options.AppendExecutionProvider_CUDA_V2(*options); Ort::Session session{*ort_env, model_path, session_options}; Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); diff --git a/onnxruntime/test/shared_lib/test_model_loading.cc b/onnxruntime/test/shared_lib/test_model_loading.cc index b7f6f7f4b9a77..5694398b9cb10 100644 --- a/onnxruntime/test/shared_lib/test_model_loading.cc +++ b/onnxruntime/test/shared_lib/test_model_loading.cc @@ -60,8 +60,9 @@ TEST(CApiTest, model_from_array) { create_session(so); #ifdef USE_CUDA - // test with CUDA provider when using onnxruntime as dll - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(so, 0)); + OrtCUDAProviderOptionsV2* options; + Ort::ThrowOnError(Ort::GetApi().CreateCUDAProviderOptions(&options)); + so.AppendExecutionProvider_CUDA_V2(*options); create_session(so); #endif } From 530a2d7b41b0584f67ddfef6679a79e9dbeee556 Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Tue, 30 Jul 2024 03:49:14 -0700 Subject: [PATCH 57/57] Enable FP16 Clip and Handle Bias in FP16 Depthwise Conv (#21493) - Improved accuracy for face-detection, image-classification, and object-detection in the GeekBench ML benchmark on ARM64. - Fixed issue https://github.com/microsoft/onnxruntime/issues/18992 --- docs/OperatorKernels.md | 4 +- onnxruntime/core/mlas/inc/mlas.h | 2 + onnxruntime/core/mlas/lib/dwconv.cpp | 32 +-- onnxruntime/core/mlas/lib/fp16_common.h | 17 ++ .../core/providers/cpu/fp16/fp16_conv.cc | 4 +- onnxruntime/core/providers/cpu/math/clip.cc | 2 +- .../test/providers/cpu/math/clip_test.cc | 18 ++ .../test/providers/cpu/nn/conv_fp16_test.cc | 237 +++++++++++++++++- .../test/providers/cpu/nn/conv_op_test.cc | 235 +++++++++++++++++ 9 files changed, 531 insertions(+), 20 deletions(-) diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 211c53d0fecc8..f265c9f985070 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -58,8 +58,8 @@ Do not modify directly.* |Ceil|*in* X:**T**
<br> *out* Y:**T**|13+|**T** = tensor(double), tensor(float)|
|||[6, 12]|**T** = tensor(double), tensor(float)|
|Celu|*in* X:**T**<br> *out* Y:**T**|12+|**T** = tensor(float)|
-|Clip|*in* input:**T**<br> *in* min:**T**<br> *in* max:**T**<br> *out* output:**T**<br><br>or<br><br>*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
-|||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
+|Clip|*in* input:**T**<br> *in* min:**T**<br> *in* max:**T**<br> *out* output:**T**<br><br>or<br><br>*in* input:**T**<br> *out* output:**T**|13+|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
+|||12|**T** = tensor(double), tensor(float), tensor(float16), tensor(int32), tensor(int64), tensor(int8), tensor(uint32), tensor(uint64), tensor(uint8)|
|||11|**T** = tensor(float)|
|||[6, 10]|**T** = tensor(float)|
|Col2Im|*in* input:**T**<br> *in* image_shape:**tensor(int64)**<br> *in* block_shape:**tensor(int64)**<br>
*out* output:**T**|18+|**T** = tensor(float)| diff --git a/onnxruntime/core/mlas/inc/mlas.h b/onnxruntime/core/mlas/inc/mlas.h index 675f7c7a13e8c..e46105324a7fb 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -1751,6 +1751,7 @@ MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* Pac * @brief Indirect Depthwise convolution for fp16 * @param Input Supplies the indirect buffer for NHWC input * @param Filter Supplies the address for filter tensor + * @param Bias Supplies the address for 1D bias tensor B, has size of M * @param Output Supplies the address for the result tensor * @param Channels # of input channels * @param OutputCount # of output pixels @@ -1762,6 +1763,7 @@ MLASCALL MlasConvDepthwise( const MLAS_FP16* const* Input, const MLAS_FP16* Filter, + const MLAS_FP16* Bias, MLAS_FP16* Output, size_t Channels, size_t OutputCount, diff --git a/onnxruntime/core/mlas/lib/dwconv.cpp b/onnxruntime/core/mlas/lib/dwconv.cpp index 15511d2d8ceac..d48d9cbb17502 100644 --- a/onnxruntime/core/mlas/lib/dwconv.cpp +++ b/onnxruntime/core/mlas/lib/dwconv.cpp @@ -14,7 +14,6 @@ Module Name: --*/ - #include "fp16_common.h" #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED @@ -24,19 +23,20 @@ void MlasConvDepthwiseKernel( const _mlas_fp16_* const* Input, const _mlas_fp16_* Filter, + const _mlas_fp16_* Bias, _mlas_fp16_* Output, size_t Channels, size_t OutputCount, size_t KernelSize, MLAS_HALF_GEMM_POSTPROCESSOR* PostProc - ) +) { while (OutputCount > 0) { size_t ChannelOffset = 0; size_t c = Channels; while (c >= 8) { - MLAS_FLOAT16X8 Accumulator = MlasZeroFloat16x8(); + MLAS_FLOAT16X8 Accumulator = Bias == nullptr ? MlasZeroFloat16x8() : MlasLoadFloat16x8(&Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -54,7 +54,7 @@ MlasConvDepthwiseKernel( } if (c >= 4) { - MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4(); + MLAS_FLOAT16X4 Accumulator = Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadFloat16x4(&Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -72,7 +72,8 @@ MlasConvDepthwiseKernel( } if (c > 0) { - MLAS_FLOAT16X4 Accumulator = MlasZeroFloat16x4(); + MLAS_FLOAT16X4 Accumulator = + Bias == nullptr ? MlasZeroFloat16x4() : MlasLoadPartialFloat16x4(&Bias[ChannelOffset], c); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -86,8 +87,7 @@ MlasConvDepthwiseKernel( Output += c; } if (PostProc) { - PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, - Channels); + PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, Channels); } Input += KernelSize; OutputCount -= 1; @@ -101,16 +101,17 @@ void MlasConvDepthwiseKernel( const _mlas_fp16_* const* Input, const _mlas_fp16_* Filter, + const _mlas_fp16_* Bias, _mlas_fp16_* Output, size_t Channels, size_t OutputCount, size_t KernelSize, MLAS_HALF_GEMM_POSTPROCESSOR* PostProc - ) +) { while (OutputCount > 0) { for (size_t ChannelOffset = 0; ChannelOffset < Channels; ChannelOffset++) { - float Accumulator = 0.0f; + float Accumulator = Bias == nullptr ? 
0.0f : MLAS_Half2Float(Bias[ChannelOffset]); size_t ChannelKernelOffset = ChannelOffset; for (size_t k = 0; k < KernelSize; k++) { @@ -120,35 +121,36 @@ MlasConvDepthwiseKernel( *Output++ = MLAS_Float2Half(Accumulator); } if (PostProc) { - PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, - Channels); + PostProc->Process(reinterpret_cast(Output - Channels), 0, 0, 1, Channels, Channels); } Input += KernelSize; OutputCount -= 1; } } -#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED - +#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED void MLASCALL MlasConvDepthwise( const MLAS_FP16* const* Input, const MLAS_FP16* Filter, + const MLAS_FP16* Bias, MLAS_FP16* Output, size_t Channels, size_t OutputCount, size_t KernelSize, MLAS_HALF_GEMM_POSTPROCESSOR* PostProc - ) +) { MlasConvDepthwiseKernel( reinterpret_cast(Input), reinterpret_cast(Filter), + reinterpret_cast(Bias), reinterpret_cast<_mlas_fp16_*>(Output), Channels, OutputCount, KernelSize, - PostProc); + PostProc + ); } diff --git a/onnxruntime/core/mlas/lib/fp16_common.h b/onnxruntime/core/mlas/lib/fp16_common.h index 1fcab870af64f..30b66cdb2ea78 100644 --- a/onnxruntime/core/mlas/lib/fp16_common.h +++ b/onnxruntime/core/mlas/lib/fp16_common.h @@ -64,6 +64,23 @@ MLAS_FORCEINLINE MLAS_FLOAT16X4 MlasLoadFloat16x4(const _mlas_fp16_* Buffer) { return vreinterpret_f16_u16(vld1_u16(Buffer)); } +MLAS_FORCEINLINE +MLAS_FLOAT16X4 +MlasLoadPartialFloat16x4(const _mlas_fp16_* Buffer, size_t len) +{ + MLAS_FLOAT16X4 Vector = MlasZeroFloat16x4(); + if ((len & 1) != 0) { + Vector = vreinterpret_f16_u16(vld1_lane_u16(Buffer + (len - 1), vreinterpret_u16_f16(Vector), 0)); + } + if ((len & 2) != 0) { + Vector = vreinterpret_f16_f32(vdup_lane_f32(vreinterpret_f32_f16(Vector), 0)); + Vector = vreinterpret_f16_f32( + vld1_lane_f32(reinterpret_cast(Buffer), vreinterpret_f32_f16(Vector), 0) + ); + } + return Vector; +} + MLAS_FORCEINLINE void MlasStoreFloat16x8(_mlas_fp16_* Buffer, MLAS_FLOAT16X8 Vector) diff --git a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc index e6867f10819ae..37db095e92570 100644 --- a/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc +++ b/onnxruntime/core/providers/cpu/fp16/fp16_conv.cc @@ -139,8 +139,9 @@ Status FusedConvFp16::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr bool share_prepacked_weights = (prepacked_weights != nullptr); + const bool is_depthwise_conv = (group_input_channels == 1 && group_output_channels == 1); // Don't pack the filter buffer if the MlasConvDepthwise path is used. 
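As a reading aid for the MLAS changes above (the PrePack hunk continues directly below), here is a hedged fp32 reference, not the MLAS kernel itself, of what the depthwise path computes per output pixel now that the bias pointer is threaded through; the function name and the use of fp32 are illustrative only:

#include <cstddef>

// Hedged reference of the MlasConvDepthwise contract after this change:
// "input" is the indirection buffer, one pointer per (output pixel, kernel position),
// each pointing at a row of `channels` values; "filter" is laid out [kernel_size][channels];
// "bias" has `channels` entries or is nullptr. The accumulator now starts at bias[c] instead of zero.
static void DepthwiseConvReference(const float* const* input, const float* filter,
                                   const float* bias, float* output,
                                   size_t channels, size_t output_count, size_t kernel_size) {
  for (size_t o = 0; o < output_count; ++o) {
    for (size_t c = 0; c < channels; ++c) {
      float acc = (bias == nullptr) ? 0.0f : bias[c];
      for (size_t k = 0; k < kernel_size; ++k) {
        acc += input[o * kernel_size + k][c] * filter[k * channels + c];
      }
      output[o * channels + c] = acc;
    }
  }
}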
- if (!(group_input_channels == 1 && group_output_channels == 1)) { + if (!is_depthwise_conv) { packed_W_size_ = MlasHalfGemmPackBSize(group_output_channels, kernel_dim, false); if (packed_W_size_ != 0) { size_t packed_W_data_size = SafeInt(group_count) * packed_W_size_; @@ -472,6 +473,7 @@ Status FusedConvFp16::Compute(OpKernelContext* context) const { MlasConvDepthwise( worker_indirection_buffer, reordered_W, + Bdata, worker_output, static_cast(M), static_cast(output_count), diff --git a/onnxruntime/core/providers/cpu/math/clip.cc b/onnxruntime/core/providers/cpu/math/clip.cc index ddb64a5a0e461..200469bc47835 100644 --- a/onnxruntime/core/providers/cpu/math/clip.cc +++ b/onnxruntime/core/providers/cpu/math/clip.cc @@ -23,7 +23,7 @@ ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES( float); ORT_SPECIFY_OP_KERNEL_ARG_DEFAULT_TYPES( kCpuExecutionProvider, kOnnxDomain, Clip, 12, Input, 0, - float, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t); + float, MLFloat16, double, int8_t, uint8_t, int32_t, uint32_t, int64_t, uint64_t); } // namespace op_kernel_type_control using EnabledClip11Types = ORT_OP_KERNEL_ARG_ENABLED_TYPE_LIST( diff --git a/onnxruntime/test/providers/cpu/math/clip_test.cc b/onnxruntime/test/providers/cpu/math/clip_test.cc index 6f81bbbe31d54..9948a6cc8a681 100644 --- a/onnxruntime/test/providers/cpu/math/clip_test.cc +++ b/onnxruntime/test/providers/cpu/math/clip_test.cc @@ -119,6 +119,24 @@ TEST(MathOpTest, Clip_Default_uint64) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } +TEST(MathOpTest, Clip_MLFloat16) { + OpTester test("Clip", 12); + + std::vector dims{3, 3}; + test.AddInput("X", dims, + {MLFloat16(-1.0f), MLFloat16(-2.0f), MLFloat16(-3.0f), + MLFloat16(-4.0f), MLFloat16(0.0f), MLFloat16(2.0f), + MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(8.0f)}); + test.AddInput("min", {}, {MLFloat16(0.0f)}); + test.AddInput("max", {}, {MLFloat16(6.0f)}); + test.AddOutput("Y", dims, + {MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(0.0f), + MLFloat16(0.0f), MLFloat16(0.0f), MLFloat16(2.0f), + MLFloat16(4.0f), MLFloat16(6.0f), MLFloat16(6.0f)}); + + test.Run(); +} + TEST(MathOpTest, Clip_int32) { OpTester test("Clip", 12); diff --git a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc index cb5fc8095982c..95b274966fbbb 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_fp16_test.cc @@ -714,6 +714,241 @@ TEST(ConvFp16Test, Conv2D_group) { TestConvFp16Op(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } +TEST(ConvFp16Test, Depthwise2D_Bias_Group1_Issue18992) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {MLFloat16(1.0f)}; + vector X_shape = {1, 1, 1, 1}; + vector W = {MLFloat16(0.5f)}; + vector W_shape = {1, 1, 1, 1}; + vector B = {MLFloat16(0.5f)}; + vector B_shape = {1}; + vector Y_shape = {1, 1, 1, 1}; + auto expected_vals = {MLFloat16(1.0f)}; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvFp16Test, Depthwise2D_Bias_Group2) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 2, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // 
pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + MLFloat16(0.0f), MLFloat16(1.0f), MLFloat16(2.0f), + MLFloat16(3.0f), MLFloat16(4.0f), MLFloat16(5.0f), + MLFloat16(6.0f), MLFloat16(7.0f), MLFloat16(8.0f), + + MLFloat16(9.0f), MLFloat16(10.0f), MLFloat16(11.0f), + MLFloat16(12.0f), MLFloat16(13.0f), MLFloat16(14.0f), + MLFloat16(15.0f), MLFloat16(16.0f), MLFloat16(17.0f)}; + vector X_shape = {1, 2, 3, 3}; + vector W = {MLFloat16(1.0f), MLFloat16(2.0f)}; + vector W_shape = {2, 1, 1, 1}; + vector B = {MLFloat16(1.0f), MLFloat16(-1.0f)}; + vector B_shape = {2}; + vector Y_shape = {1, 2, 3, 3}; + auto expected_vals = { + MLFloat16(1.0f), MLFloat16(2.0f), MLFloat16(3.0f), + MLFloat16(4.0f), MLFloat16(5.0f), MLFloat16(6.0f), + MLFloat16(7.0f), MLFloat16(8.0f), MLFloat16(9.0f), + + MLFloat16(17.0f), MLFloat16(19.0f), MLFloat16(21.0f), + MLFloat16(23.0f), MLFloat16(25.0f), MLFloat16(27.0f), + MLFloat16(29.0f), MLFloat16(31.0f), MLFloat16(33.0f)}; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvFp16Test, Depthwise2D_Bias_Group15) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 15, // group + vector{2, 2}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + // C = 0 + MLFloat16(0.0f), MLFloat16(1.0f), + MLFloat16(2.0f), MLFloat16(3.0f), + + // C = 1 + MLFloat16(4.0f), MLFloat16(5.0f), + MLFloat16(6.0f), MLFloat16(7.0f), + + // C = 2 + MLFloat16(8.0f), MLFloat16(9.0f), + MLFloat16(10.0f), MLFloat16(11.0f), + + // C = 3 + MLFloat16(12.0f), MLFloat16(13.0f), + MLFloat16(14.0f), MLFloat16(15.0f), + + // C = 4 + MLFloat16(16.0f), MLFloat16(17.0f), + MLFloat16(18.0f), MLFloat16(19.0f), + + // C = 5 + MLFloat16(20.0f), MLFloat16(21.0f), + MLFloat16(22.0f), MLFloat16(23.0f), + + // C = 6 + MLFloat16(24.0f), MLFloat16(25.0f), + MLFloat16(26.0f), MLFloat16(27.0f), + + // C = 7 + MLFloat16(28.0f), MLFloat16(29.0f), + MLFloat16(30.0f), MLFloat16(31.0f), + + // C = 8 + MLFloat16(32.0f), MLFloat16(33.0f), + MLFloat16(34.0f), MLFloat16(35.0f), + + // C = 9 + MLFloat16(36.0f), MLFloat16(37.0f), + MLFloat16(38.0f), MLFloat16(39.0f), + + // C = 10 + MLFloat16(40.0f), MLFloat16(41.0f), + MLFloat16(42.0f), MLFloat16(43.0f), + + // C = 11 + MLFloat16(44.0f), MLFloat16(45.0f), + MLFloat16(46.0f), MLFloat16(47.0f), + + // C = 12 + MLFloat16(48.0f), MLFloat16(49.0f), + MLFloat16(50.0f), MLFloat16(51.0f), + + // C = 13 + MLFloat16(52.0f), MLFloat16(53.0f), + MLFloat16(54.0f), MLFloat16(55.0f), + + // C = 14 + MLFloat16(56.0f), MLFloat16(57.0f), + MLFloat16(58.0f), MLFloat16(59.0f)}; + vector X_shape = {1, 15, 2, 2}; + vector W = { + // M = 0 + MLFloat16(0.0f), MLFloat16(1.0f), + MLFloat16(2.0f), MLFloat16(3.0f), + + // M = 1 + MLFloat16(4.0f), MLFloat16(5.0f), + MLFloat16(6.0f), MLFloat16(7.0f), + + // M = 2 + MLFloat16(8.0f), MLFloat16(9.0f), + MLFloat16(10.0f), MLFloat16(11.0f), + + // M = 3 + MLFloat16(12.0f), MLFloat16(13.0f), + MLFloat16(14.0f), MLFloat16(15.0f), + + // M = 4 + MLFloat16(16.0f), MLFloat16(17.0f), + MLFloat16(18.0f), MLFloat16(19.0f), + + // M = 5 + MLFloat16(20.0f), MLFloat16(21.0f), + MLFloat16(22.0f), MLFloat16(23.0f), + + // M = 6 + MLFloat16(24.0f), MLFloat16(25.0f), + MLFloat16(26.0f), MLFloat16(27.0f), + + // M = 7 + MLFloat16(28.0f), MLFloat16(29.0f), + MLFloat16(30.0f), MLFloat16(31.0f), + + // M = 8 + 
MLFloat16(32.0f), MLFloat16(33.0f), + MLFloat16(34.0f), MLFloat16(35.0f), + + // M = 9 + MLFloat16(36.0f), MLFloat16(37.0f), + MLFloat16(38.0f), MLFloat16(39.0f), + + // M = 10 + MLFloat16(40.0f), MLFloat16(41.0f), + MLFloat16(42.0f), MLFloat16(43.0f), + + // M = 11 + MLFloat16(44.0f), MLFloat16(45.0f), + MLFloat16(46.0f), MLFloat16(47.0f), + + // M = 12 + MLFloat16(48.0f), MLFloat16(49.0f), + MLFloat16(50.0f), MLFloat16(51.0f), + + // M = 13 + MLFloat16(52.0f), MLFloat16(53.0f), + MLFloat16(54.0f), MLFloat16(55.0f), + + // M = 14 + MLFloat16(56.0f), MLFloat16(57.0f), + MLFloat16(58.0f), MLFloat16(59.0f)}; + vector W_shape = {15, 1, 2, 2}; + vector B = { + MLFloat16(101.0f), + MLFloat16(102.0f), + MLFloat16(103.0f), + MLFloat16(104.0f), + MLFloat16(105.0f), + MLFloat16(106.0f), + MLFloat16(107.0f), + MLFloat16(108.0f), + MLFloat16(109.0f), + MLFloat16(110.0f), + MLFloat16(111.0f), + MLFloat16(112.0f), + MLFloat16(113.0f), + MLFloat16(114.0f), + MLFloat16(115.0f)}; + vector B_shape = {15}; + vector Y_shape = {1, 15, 1, 1}; + auto expected_vals = { + MLFloat16(115.0f), // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 101.0 + MLFloat16(228.0f), + MLFloat16(469.0f), + MLFloat16(838.0f), + MLFloat16(1335.0f), + MLFloat16(1960.0f), + MLFloat16(2713.0f), // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 107.0 + MLFloat16(3594.0f), + MLFloat16(4603.0f), + MLFloat16(5740.0f), + MLFloat16(7005.0f), + MLFloat16(8398.0f), + MLFloat16(9919.0f), // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 113.0 + MLFloat16(11568.0f), // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 114.0 + MLFloat16(13345.0f) // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 115.0 + }; + + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvFp16Op(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + TEST(ConvFp16Test, ConvDimWithZero) { ConvOpAndTestAttributes attrs = { "", // auto_pad @@ -1074,4 +1309,4 @@ TEST(ConvFp16Test, SharedPrepackedWeights) { } // namespace test } // namespace onnxruntime -#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED \ No newline at end of file +#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED diff --git a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc index 0efa78af2795c..2d885ee9d479f 100644 --- a/onnxruntime/test/providers/cpu/nn/conv_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/conv_op_test.cc @@ -647,6 +647,241 @@ TEST(ConvTest, Conv2D_group) { TestConvOp(attrs, {X, W}, {X_shape, W_shape}, expected_vals, Y_shape, true); } +TEST(ConvTest, Depthwise2D_Bias_Group1_Issue18992) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 1, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = {1.0f}; + vector X_shape = {1, 1, 1, 1}; + vector W = {0.5f}; + vector W_shape = {1, 1, 1, 1}; + vector B = {0.5f}; + vector B_shape = {1}; + vector Y_shape = {1, 1, 1, 1}; + auto expected_vals = {1.0f}; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvTest, Depthwise2D_Bias_Group2) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 2, // group + vector{1, 1}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + 0.0f, 1.0f, 2.0f, + 3.0f, 
4.0f, 5.0f, + 6.0f, 7.0f, 8.0f, + + 9.0f, 10.0f, 11.0f, + 12.0f, 13.0f, 14.0f, + 15.0f, 16.0f, 17.0f}; + vector X_shape = {1, 2, 3, 3}; + vector W = {1.0f, 2.0f}; + vector W_shape = {2, 1, 1, 1}; + vector B = {1.0f, -1.0f}; + vector B_shape = {2}; + vector Y_shape = {1, 2, 3, 3}; + auto expected_vals = { + 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, + 7.0f, 8.0f, 9.0f, + + 17.0f, 19.0f, 21.0f, + 23.0f, 25.0f, 27.0f, + 29.0f, 31.0f, 33.0f}; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + +TEST(ConvTest, Depthwise2D_Bias_Group15) { + ConvOpAndTestAttributes attrs = { + "", // auto_pad + vector{1, 1}, // dilations + 15, // group + vector{2, 2}, // kernel_shape + vector{0, 0, 0, 0}, // pads + vector{1, 1}, // strides + {} // excluded EPs + }; + + vector X = { + // C = 0 + 0.0f, 1.0f, + 2.0f, 3.0f, + + // C = 1 + 4.0f, 5.0f, + 6.0f, 7.0f, + + // C = 2 + 8.0f, 9.0f, + 10.0f, 11.0f, + + // C = 3 + 12.0f, 13.0f, + 14.0f, 15.0f, + + // C = 4 + 16.0f, 17.0f, + 18.0f, 19.0f, + + // C = 5 + 20.0f, 21.0f, + 22.0f, 23.0f, + + // C = 6 + 24.0f, 25.0f, + 26.0f, 27.0f, + + // C = 7 + 28.0f, 29.0f, + 30.0f, 31.0f, + + // C = 8 + 32.0f, 33.0f, + 34.0f, 35.0f, + + // C = 9 + 36.0f, 37.0f, + 38.0f, 39.0f, + + // C = 10 + 40.0f, 41.0f, + 42.0f, 43.0f, + + // C = 11 + 44.0f, 45.0f, + 46.0f, 47.0f, + + // C = 12 + 48.0f, 49.0f, + 50.0f, 51.0f, + + // C = 13 + 52.0f, 53.0f, + 54.0f, 55.0f, + + // C = 14 + 56.0f, 57.0f, + 58.0f, 59.0f}; + vector X_shape = {1, 15, 2, 2}; + vector W = { + // M = 0 + 0.0f, 1.0f, + 2.0f, 3.0f, + + // M = 1 + 4.0f, 5.0f, + 6.0f, 7.0f, + + // M = 2 + 8.0f, 9.0f, + 10.0f, 11.0f, + + // M = 3 + 12.0f, 13.0f, + 14.0f, 15.0f, + + // M = 4 + 16.0f, 17.0f, + 18.0f, 19.0f, + + // M = 5 + 20.0f, 21.0f, + 22.0f, 23.0f, + + // M = 6 + 24.0f, 25.0f, + 26.0f, 27.0f, + + // M = 7 + 28.0f, 29.0f, + 30.0f, 31.0f, + + // M = 8 + 32.0f, 33.0f, + 34.0f, 35.0f, + + // M = 9 + 36.0f, 37.0f, + 38.0f, 39.0f, + + // M = 10 + 40.0f, 41.0f, + 42.0f, 43.0f, + + // M = 11 + 44.0f, 45.0f, + 46.0f, 47.0f, + + // M = 12 + 48.0f, 49.0f, + 50.0f, 51.0f, + + // M = 13 + 52.0f, 53.0f, + 54.0f, 55.0f, + + // M = 14 + 56.0f, 57.0f, + 58.0f, 59.0f}; + vector W_shape = {15, 1, 2, 2}; + vector B = { + 101.0f, + 102.0f, + 103.0f, + 104.0f, + 105.0f, + 106.0f, + 107.0f, + 108.0f, + 109.0f, + 110.0f, + 111.0f, + 112.0f, + 113.0f, + 114.0f, + 115.0f}; + vector B_shape = {15}; + vector Y_shape = {1, 15, 1, 1}; + auto expected_vals = { + 115.0f, // 0.0*0.0 + 1.0*1.0 + 2.0*2.0 + 3.0*3.0 + 101.0 + 228.0f, + 469.0f, + 838.0f, + 1335.0f, + 1960.0f, + 2713.0f, // 24.0*24.0 + 25.0*25.0 + 26.0*26.0 + 27.0*27.0 + 107.0 + 3594.0f, + 4603.0f, + 5740.0f, + 7005.0f, + 8398.0f, + 9919.0f, // 48.0*48.0 + 49.0*49.0 + 50.0*50.0 + 51.0*51.0 + 113.0 + 11568.0f, // 52.0*52.0 + 53.0*53.0 + 54.0*54.0 + 55.0*55.0 + 114.0 + 13345.0f // 56.0*56.0 + 57.0*57.0 + 58.0*58.0 + 59.0*59.0 + 115.0 + }; + + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape); + TestConvOp(attrs, {X, W, B}, {X_shape, W_shape, B_shape}, expected_vals, Y_shape, true); +} + TEST(ConvTest, ConvDimWithZero) { ConvOpAndTestAttributes attrs = { "", // auto_pad