diff --git a/cmake/deps.txt b/cmake/deps.txt index aeb7c05080abb..575df832ac8b0 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -3,10 +3,15 @@ #The columns are separated by ";" because a list in cmake is just a ";" separated group of strings. #Names should be in lower case. They will be used as variable names in cmake. #URLs can be either https URLs or local file paths in cmake-style(directory separator is a forward slash character). -#SHA1 hashes can be generated by running sha1sum command. +#SHA1 hashes can be generated by running sha1sum command on linux. PowerShell can also be used: +# (Get-FileHash -Algorithm SHA1 ).Hash.ToLower() #If you need to change abseil's version to a different one, you may also want to update external\abseil-cpp.natvis #since the file contains a version string: "lts_20230802". However, the file is for debugging purposes only and would #not affect built binaries. +# +# NOTE: You must run deps_update_and_upload.py when ready to test your changes in a CI. +# See https://microsoft.sharepoint.com/teams/ONNX2/_layouts/OneNote.aspx?id=%2Fteams%2FONNX2%2FShared%20Documents%2FNotebooks%2FONNX%20Ecosystem%20Team%20Notebook&wd=target%28Development.one%7C63D3AB47-51D1-4A62-9965-66882234BD44%2FAdd%20or%20update%20a%20dependency%20in%20deps.txt%7C0E9ED71D-89D5-40FA-B05F-C0123289C591%2F%29 +# abseil_cpp;https://github.com/abseil/abseil-cpp/archive/refs/tags/20230802.0.zip;04271dfbfac59269b6939e1e9d5faf0d18a7ba91 cxxopts;https://github.com/jarro2783/cxxopts/archive/3c73d91c0b04e2b59462f0a741be8c07024c1bc0.zip;6c6ca7f8480b26c8d00476e0e24b7184717fe4f0 date;https://github.com/HowardHinnant/date/archive/refs/tags/v3.0.1.zip;2dac0c81dc54ebdd8f8d073a75c053b04b56e159 @@ -18,7 +23,7 @@ fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34 google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.7.0.zip;e97c368b176e8614e3f1bf13dd9abcf6a7ad9908 google_nsync;https://github.com/google/nsync/archive/refs/tags/1.26.0.zip;5e7c00ef6bf5b787386fc040067903ec774e2752 googletest;https://github.com/google/googletest/archive/refs/tags/v1.14.0.zip;0ac421f2ec11af38b0fff0f1992184032731a8bc -googlexnnpack;https://github.com/google/XNNPACK/archive/003c580e696a774afdc984996ee909b7c8d8128c.zip;9f192e3f15e1e37ae9c78d53eeea47e45c5eb31c +googlexnnpack;https://github.com/google/XNNPACK/archive/0da379fc4808f9601faef392352018c741c0f297.zip;663883491e380b628e0a5b162b5f2658032fae73 json;https://github.com/nlohmann/json/archive/refs/tags/v3.10.5.zip;f257f8dc27c5b8c085dc887b40cddd18ae1f725c microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf368104cd22a87b4dd0c80228919bb2df3e2a14 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 @@ -35,7 +40,7 @@ protoc_linux_x86;https://github.com/protocolbuffers/protobuf/releases/download/v protoc_linux_aarch64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-linux-aarch_64.zip;df9d45470b0b8cf939dd2f0ec6b88e9cafc4d617 protoc_mac_universal;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-osx-universal_binary.zip;23710c3d1c2036d8d65a6a22234372fa2d7af9ef psimd;https://github.com/Maratyszcza/psimd/archive/072586a71b55b7f8c584153d223e95687148a900.zip;1f5454b01f06f9656b77e4a5e2e31d7422487013 -pthreadpool;https://github.com/Maratyszcza/pthreadpool/archive/1787867f6183f056420e532eec640cba25efafea.zip;e43e80781560c5ab404a4da20f34d846f5f5d101 
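The header comment above describes generating the SHA1 column with `sha1sum` or PowerShell's `Get-FileHash`. For reference, a minimal Python sketch that produces the same lowercase digest for a locally downloaded archive; the file name below is illustrative only:

# Minimal sketch: compute the lowercase SHA1 digest used in the third column of deps.txt.
# Assumes the archive has already been downloaded; the path below is an illustration.
import hashlib

def sha1_of_file(path: str, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha1()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()  # lowercase, matching sha1sum and (Get-FileHash ...).Hash.ToLower()

print(sha1_of_file("pthreadpool-4fe0e1e183925bf8cfa6aae24237e724a96479b8.zip"))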
+pthreadpool;https://github.com/Maratyszcza/pthreadpool/archive/4fe0e1e183925bf8cfa6aae24237e724a96479b8.zip;07a0aa91dd9bf86f31b95497e00f31d8a261a4bd pybind11;https://github.com/pybind/pybind11/archive/refs/tags/v2.10.1.zip;769b6aa67a77f17a770960f604b727645b6f6a13 pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/959002f82d7962a473d8bf301845f2af720e0aa4.zip;85da3caa60eb2b148613b443fbc2bfdc30689965 re2;https://github.com/google/re2/archive/refs/tags/2022-06-01.zip;aa77313b76e91b531ee7f3e45f004c6a502a5374 diff --git a/cmake/external/xnnpack.cmake b/cmake/external/xnnpack.cmake index 7455584f1a625..e661aa51bfc17 100644 --- a/cmake/external/xnnpack.cmake +++ b/cmake/external/xnnpack.cmake @@ -25,17 +25,23 @@ set(FXDIV_SOURCE_DIR ${fxdiv_SOURCE_DIR}) FetchContent_Declare(pthreadpool URL ${DEP_URL_pthreadpool} URL_HASH SHA1=${DEP_SHA1_pthreadpool}) onnxruntime_fetchcontent_makeavailable(pthreadpool) -FetchContent_Declare(googlexnnpack URL ${DEP_URL_googlexnnpack} URL_HASH SHA1=${DEP_SHA1_googlexnnpack} -PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/xnnpack/AddEmscriptenAndIosSupport.patch) +FetchContent_Declare(googlexnnpack URL ${DEP_URL_googlexnnpack} URL_HASH SHA1=${DEP_SHA1_googlexnnpack} + PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/xnnpack/AddEmscriptenAndIosSupport.patch + ) onnxruntime_fetchcontent_makeavailable(googlexnnpack) set(XNNPACK_DIR ${googlexnnpack_SOURCE_DIR}) set(XNNPACK_INCLUDE_DIR ${XNNPACK_DIR}/include) set(onnxruntime_EXTERNAL_LIBRARIES_XNNPACK XNNPACK pthreadpool) + # the XNNPACK CMake setup doesn't include the WASM kernels so we have to manually set those up if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + # See source lists in _deps/googlexnnpack-src/BUILD.bazel for wasm_prod_microkernels + message("Adding WebAssembly Source Files to XNNPACK") + set(wasm_srcs "") + file(READ "${XNNPACK_DIR}/BUILD.bazel" xnnpack_bazel_config) # Replace newlines with semicolon so that it is treated as a list by CMake @@ -70,25 +76,23 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(${target_srcs} ${bazel_srcs} PARENT_SCOPE) endfunction() - GetSrcListFromBazel("PROD_SCALAR_WASM_MICROKERNEL_SRCS" prod_scalar_wasm_srcs) - GetSrcListFromBazel("ALL_WASM_MICROKERNEL_SRCS" all_wasm_srcs) - GetSrcListFromBazel("WASM32_ASM_MICROKERNEL_SRCS" wasm32_asm_srcs) + GetSrcListFromBazel("OPERATOR_SRCS" operator_srcs) + GetSrcListFromBazel("TABLE_SRCS" table_srcs) + list(APPEND wasm_srcs ${operator_srcs} ${table_srcs}) - message(DEBUG "prod_scalar_wasm_srcs: ${prod_scalar_wasm_srcs}\n") - message(DEBUG "all_wasm_srcs: ${all_wasm_srcs}\n") - message(DEBUG "wasm32_asm_srcs: ${wasm32_asm_srcs}\n") + # kernels + list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/scalar.c) + list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasm.c) - message("Adding WebAssembly Source Files to XNNPACK") - set(wasm_srcs "") - list(APPEND wasm_srcs ${prod_scalar_wasm_srcs}) - list(APPEND wasm_srcs ${all_wasm_srcs}) - list(APPEND wasm_srcs ${wasm32_asm_srcs}) + if(onnxruntime_ENABLE_WEBASSEMBLY_SIMD) + list(APPEND wasm_srcs ${XNNPACK_DIR}/src/amalgam/gen/wasmsimd.c) + target_compile_options(XNNPACK PRIVATE "-msimd128") + endif() + message(DEBUG "wasm_srcs: ${wasm_srcs}\n") target_sources(XNNPACK PRIVATE ${wasm_srcs}) - if(onnxruntime_ENABLE_WEBASSEMBLY_SIMD) - GetSrcListFromBazel("ALL_WASMSIMD_MICROKERNEL_SRCS" all_wasmsimd_srcs) - message(DEBUG "all_wasmsimd_srcs: ${all_wasmsimd_srcs}") - 
target_sources(XNNPACK PRIVATE ${all_wasmsimd_srcs}) - endif() + # add flags from BAZEL.build + target_compile_options(XNNPACK PRIVATE "-fno-fast-math") + target_compile_options(XNNPACK PRIVATE "-fno-math-errno") endif() diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 6ccaf00499e95..9d9b006c595bb 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -282,44 +282,77 @@ endif() # Assemble the Apple static framework (iOS and macOS) if(onnxruntime_BUILD_APPLE_FRAMEWORK) + if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") + set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT}) + else() # macOS + set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) + endif() + + # Setup the various directories required. Remove any existing ones so we start with a clean directory. set(STATIC_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/static_libraries) - file(MAKE_DIRECTORY ${STATIC_LIB_DIR}) + set(STATIC_LIB_TEMP_DIR ${STATIC_LIB_DIR}/temp) + add_custom_command(TARGET onnxruntime PRE_BUILD COMMAND ${CMAKE_COMMAND} -E rm -rf ${STATIC_LIB_DIR}) + add_custom_command(TARGET onnxruntime PRE_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${STATIC_LIB_DIR}) + add_custom_command(TARGET onnxruntime PRE_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${STATIC_LIB_TEMP_DIR}) - # Remove the existing files in the STATIC_LIB_DIR folder - file(GLOB _OLD_STATIC_LIBS ${STATIC_LIB_DIR}/*.a) - file(REMOVE "${_OLD_STATIC_LIBS}") + set(STATIC_FRAMEWORK_DIR ${STATIC_FRAMEWORK_OUTPUT_DIR}/static_framework/onnxruntime.framework) + add_custom_command(TARGET onnxruntime PRE_BUILD COMMAND ${CMAKE_COMMAND} -E rm -rf ${STATIC_FRAMEWORK_DIR}) + add_custom_command(TARGET onnxruntime PRE_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${STATIC_FRAMEWORK_DIR}) + + # replicate XCode's Single Object Pre-Link + # link the internal onnxruntime .o files with the external .a files into a single relocatable object + # to enforce symbol visibility. doing it this way limits the symbols included from the .a files to symbols used + # by the ORT .o files. - # Go through all the static libraries, and create symbolic links - foreach(_LIB ${onnxruntime_INTERNAL_LIBRARIES} ${onnxruntime_EXTERNAL_LIBRARIES}) + # If it's an onnxruntime library, extract .o files to a separate directory for each library to avoid any clashes + # with filenames (e.g. 
utils.o) + foreach(_LIB ${onnxruntime_INTERNAL_LIBRARIES} ) GET_TARGET_PROPERTY(_LIB_TYPE ${_LIB} TYPE) if(_LIB_TYPE STREQUAL "STATIC_LIBRARY") - add_custom_command(TARGET onnxruntime POST_BUILD COMMAND ${CMAKE_COMMAND} -E create_symlink $ ${STATIC_LIB_DIR}/$) + set(CUR_STATIC_LIB_OBJ_DIR ${STATIC_LIB_TEMP_DIR}/$) + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${CUR_STATIC_LIB_OBJ_DIR}) + + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ar ARGS -x $ + WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR}) endif() endforeach() - if(${CMAKE_SYSTEM_NAME} STREQUAL "iOS") - set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT}) - else() # macOS - set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}) - endif() + # for external libraries we create a symlink to the .a file + foreach(_LIB ${onnxruntime_EXTERNAL_LIBRARIES}) + GET_TARGET_PROPERTY(_LIB_TYPE ${_LIB} TYPE) + if(_LIB_TYPE STREQUAL "STATIC_LIBRARY") + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink + $ ${STATIC_LIB_DIR}/$) + endif() + endforeach() - # Assemble the static framework - set(STATIC_FRAMEWORK_DIR ${STATIC_FRAMEWORK_OUTPUT_DIR}/static_framework/onnxruntime.framework) - set(STATIC_FRAMEWORK_HEADER_DIR ${STATIC_FRAMEWORK_DIR}/Headers) - file(MAKE_DIRECTORY ${STATIC_FRAMEWORK_DIR}) - # Remove all files under STATIC_FRAMEWORK_DIR (if any) - file(GLOB_RECURSE _OLD_STATIC_FRAMEWORK ${STATIC_FRAMEWORK_DIR}/*.*) - file(REMOVE "${_OLD_STATIC_FRAMEWORK}") + # do the pre-link with `ld -r` to create a single relocatable object with correct symbol visibility + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ld ARGS -r -o ${STATIC_LIB_DIR}/prelinked_objects.o */*.o ../*.a + WORKING_DIRECTORY ${STATIC_LIB_TEMP_DIR}) + + # create the static library + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND libtool -static -o ${STATIC_FRAMEWORK_DIR}/onnxruntime prelinked_objects.o + WORKING_DIRECTORY ${STATIC_LIB_DIR}) + # Assemble the other pieces of the static framework + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ${CMAKE_COMMAND} -E + copy_if_different ${INFO_PLIST_PATH} ${STATIC_FRAMEWORK_DIR}/Info.plist) + + # add the framework header files + set(STATIC_FRAMEWORK_HEADER_DIR ${STATIC_FRAMEWORK_DIR}/Headers) file(MAKE_DIRECTORY ${STATIC_FRAMEWORK_HEADER_DIR}) - # copy the header files one by one, and the Info.plist foreach(h_ ${ONNXRUNTIME_PUBLIC_HEADERS}) get_filename_component(HEADER_NAME_ ${h_} NAME) - add_custom_command(TARGET onnxruntime POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different ${h_} ${STATIC_FRAMEWORK_HEADER_DIR}/${HEADER_NAME_}) + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ${CMAKE_COMMAND} -E + copy_if_different ${h_} ${STATIC_FRAMEWORK_HEADER_DIR}/${HEADER_NAME_}) endforeach() - add_custom_command(TARGET onnxruntime POST_BUILD COMMAND ${CMAKE_COMMAND} -E copy_if_different ${INFO_PLIST_PATH} ${STATIC_FRAMEWORK_DIR}/Info.plist) - # link the static library - add_custom_command(TARGET onnxruntime POST_BUILD COMMAND libtool -static -o ${STATIC_FRAMEWORK_DIR}/onnxruntime *.a WORKING_DIRECTORY ${STATIC_LIB_DIR}) endif() diff --git a/cmake/onnxruntime_graph.cmake b/cmake/onnxruntime_graph.cmake index 735c86956ec4f..3f532ec2c3261 100644 --- a/cmake/onnxruntime_graph.cmake +++ b/cmake/onnxruntime_graph.cmake @@ -20,6 +20,8 @@ if (onnxruntime_MINIMAL_BUILD) "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/onnx_deprecated_operators.cc" 
"${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/onnx_function_util.h" "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/onnx_function_util.cc" + "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/shape_inference_functions.h" + "${ONNXRUNTIME_ROOT}/core/graph/contrib_ops/shape_inference_functions.cc" "${ONNXRUNTIME_ROOT}/core/graph/function_template.h" "${ONNXRUNTIME_ROOT}/core/graph/function_utils.h" "${ONNXRUNTIME_ROOT}/core/graph/function_utils.cc" diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index 6af6fed69f051..686a993de3a4a 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -57,12 +57,14 @@ URL ${DEP_URL_onnx_tensorrt} URL_HASH SHA1=${DEP_SHA1_onnx_tensorrt} ) + if (NOT CUDA_INCLUDE_DIR) + set(CUDA_INCLUDE_DIR ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # onnx-tensorrt repo needs this variable to build + endif() # The onnx_tensorrt repo contains a test program, getSupportedAPITest, which doesn't support Windows. It uses # unistd.h. So we must exclude it from our build. onnxruntime_fetchcontent_makeavailable is for the purpose. onnxruntime_fetchcontent_makeavailable(onnx_tensorrt) include_directories(${onnx_tensorrt_SOURCE_DIR}) set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) - set(CUDA_INCLUDE_DIR ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) # onnx-tensorrt repo needs this variable to build if ( CMAKE_COMPILER_IS_GNUCC ) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter") endif() diff --git a/cmake/onnxruntime_providers_xnnpack.cmake b/cmake/onnxruntime_providers_xnnpack.cmake index 30ae90e6564cc..9c00703ca0846 100644 --- a/cmake/onnxruntime_providers_xnnpack.cmake +++ b/cmake/onnxruntime_providers_xnnpack.cmake @@ -15,7 +15,8 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_providers_xnnpack_cc_srcs}) onnxruntime_add_static_library(onnxruntime_providers_xnnpack ${onnxruntime_providers_xnnpack_cc_srcs}) onnxruntime_add_include_to_target(onnxruntime_providers_xnnpack - onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} XNNPACK pthreadpool flatbuffers::flatbuffers Boost::mp11 safeint_interface + onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} XNNPACK pthreadpool + flatbuffers::flatbuffers Boost::mp11 safeint_interface ) add_dependencies(onnxruntime_providers_xnnpack onnx ${onnxruntime_EXTERNAL_DEPENDENCIES}) @@ -35,4 +36,4 @@ # there are some in builds where sizeof(size_t) != sizeof(int64_t), e.g., in 'ONNX Runtime Web CI Pipeline' if (HAS_SHORTEN_64_TO_32 AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) target_compile_options(onnxruntime_providers_xnnpack PRIVATE -Wno-error=shorten-64-to-32) - endif() \ No newline at end of file + endif() diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index f5f98066675fb..bdb0230a8ebd0 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -41,7 +41,7 @@ function(AddTest) if (MSVC) target_compile_options(${_UT_TARGET} PRIVATE "$<$:SHELL:--compiler-options /wd6330>" "$<$>:/wd6330>") - #Abseil has a lot of C4127/C4324 warnings. + #Abseil has a lot of C4127/C4324 warnings. 
target_compile_options(${_UT_TARGET} PRIVATE "$<$:SHELL:--compiler-options /wd4127>" "$<$>:/wd4127>") target_compile_options(${_UT_TARGET} PRIVATE "$<$:SHELL:--compiler-options /wd4324>" @@ -201,8 +201,18 @@ function(AddTest) list(APPEND TEST_NODE_FLAGS "--experimental-wasm-simd") endif() + # prefer Node from emsdk so the version is more deterministic + if (DEFINED ENV{EMSDK_NODE}) + set(NODE_EXECUTABLE $ENV{EMSDK_NODE}) + else() + # warning as we don't know what node version is being used and whether things like the TEST_NODE_FLAGS + # will be valid. e.g. "--experimental-wasm-simd" is not valid with node v20 or later. + message(WARNING "EMSDK_NODE environment variable was not set. Falling back to system `node`.") + set(NODE_EXECUTABLE node) + endif() + add_test(NAME ${_UT_TARGET} - COMMAND node ${TEST_NODE_FLAGS} ${_UT_TARGET}.js ${TEST_ARGS} + COMMAND ${NODE_EXECUTABLE} ${TEST_NODE_FLAGS} ${_UT_TARGET}.js ${TEST_ARGS} WORKING_DIRECTORY $ ) endif() diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index c6510c97a617e..9014089cb6112 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -192,8 +192,13 @@ else() onnxruntime_util re2::re2 ) + + set(EXPORTED_RUNTIME_METHODS "'stackAlloc','stackRestore','stackSave','UTF8ToString','stringToUTF8','lengthBytesUTF8'") + if (onnxruntime_USE_XNNPACK) target_link_libraries(onnxruntime_webassembly PRIVATE XNNPACK) + string(APPEND EXPORTED_RUNTIME_METHODS ",'addFunction'") + target_link_options(onnxruntime_webassembly PRIVATE "SHELL:-s ALLOW_TABLE_GROWTH=1") endif() if(onnxruntime_USE_WEBNN) @@ -204,7 +209,6 @@ else() target_link_libraries(onnxruntime_webassembly PRIVATE tensorboard) endif() - set(EXPORTED_RUNTIME_METHODS "['stackAlloc','stackRestore','stackSave','UTF8ToString','stringToUTF8','lengthBytesUTF8']") if (onnxruntime_USE_JSEP) set(EXPORTED_FUNCTIONS "_malloc,_free,_JsepOutput,_JsepGetNodeName") else() @@ -212,7 +216,7 @@ else() endif() target_link_options(onnxruntime_webassembly PRIVATE - "SHELL:-s EXPORTED_RUNTIME_METHODS=${EXPORTED_RUNTIME_METHODS}" + "SHELL:-s EXPORTED_RUNTIME_METHODS=[${EXPORTED_RUNTIME_METHODS}]" "SHELL:-s EXPORTED_FUNCTIONS=${EXPORTED_FUNCTIONS}" "SHELL:-s MAXIMUM_MEMORY=4294967296" "SHELL:-s EXIT_RUNTIME=0" diff --git a/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch b/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch index 37bdbf9fb53f6..460b4d97c499b 100644 --- a/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch +++ b/cmake/patches/xnnpack/AddEmscriptenAndIosSupport.patch @@ -1,66 +1,27 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index d53c48aa1..77c3cf983 100755 +index dba9b4687..bcaa18ad7 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -105,22 +105,12 @@ ENDIF() - +@@ -122,7 +122,7 @@ ENDIF() + # ---[ Build flags IF(NOT CMAKE_SYSTEM_NAME) MESSAGE(FATAL_ERROR "CMAKE_SYSTEM_NAME not defined") --ELSEIF(NOT CMAKE_SYSTEM_NAME MATCHES "^(Darwin|Linux|Android|Windows|CYGWIN|MSYS)$") -+ELSEIF(NOT CMAKE_SYSTEM_NAME MATCHES "^(Darwin|Linux|Android|Windows|CYGWIN|MSYS|Emscripten|iOS)$") - MESSAGE(FATAL_ERROR "Unrecognized CMAKE_SYSTEM_NAME = ${CMAKE_SYSTEM_NAME}") +-ELSEIF(NOT CMAKE_SYSTEM_NAME MATCHES "^(Android|Darwin|iOS|Linux|Windows|CYGWIN|MSYS|QURT)$") ++ELSEIF(NOT CMAKE_SYSTEM_NAME MATCHES "^(Android|Darwin|iOS|Linux|Windows|CYGWIN|MSYS|QURT|Emscripten|iOS)$") + MESSAGE(FATAL_ERROR "Unrecognized CMAKE_SYSTEM_NAME value \"${CMAKE_SYSTEM_NAME}\"") ENDIF() - - # ---[ Download deps - IF(NOT XNNPACK_USE_SYSTEM_LIBS) -- 
IF(NOT DEFINED CLOG_SOURCE_DIR) -- MESSAGE(STATUS "Downloading clog to ${CMAKE_BINARY_DIR}/clog-source (define CLOG_SOURCE_DIR to avoid it)") -- CONFIGURE_FILE(cmake/DownloadCLog.cmake "${CMAKE_BINARY_DIR}/clog-download/CMakeLists.txt") -- EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" -G "${CMAKE_GENERATOR}" . -- WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/clog-download") -- EXECUTE_PROCESS(COMMAND "${CMAKE_COMMAND}" --build . -- WORKING_DIRECTORY "${CMAKE_BINARY_DIR}/clog-download") -- SET(CLOG_SOURCE_DIR "${CMAKE_BINARY_DIR}/clog-source" CACHE STRING "clog source directory") -- ENDIF() -- - IF(NOT DEFINED CPUINFO_SOURCE_DIR) - MESSAGE(STATUS "Downloading cpuinfo to ${CMAKE_BINARY_DIR}/cpuinfo-source (define CPUINFO_SOURCE_DIR to avoid it)") - CONFIGURE_FILE(cmake/DownloadCpuinfo.cmake "${CMAKE_BINARY_DIR}/cpuinfo-download/CMakeLists.txt") -@@ -7108,6 +7098,10 @@ IF(MSVC) - SET_PROPERTY(SOURCE ${ALL_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: /O2 >") - SET_PROPERTY(SOURCE ${HOT_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: /O2 >") - SET_PROPERTY(SOURCE ${COLD_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: /O1 >") -+ELSEIF(CMAKE_GENERATOR STREQUAL Xcode) -+ TARGET_COMPILE_OPTIONS(all_microkernels PRIVATE $<$>: -O2 >) -+ TARGET_COMPILE_OPTIONS(XNNPACK PRIVATE $<$>: -O2 >) -+ TARGET_COMPILE_OPTIONS(XNNPACK PRIVATE $<$>: -Os >) - ELSE() - SET_PROPERTY(SOURCE ${ALL_MICROKERNEL_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: -O2 >") - SET_PROPERTY(SOURCE ${HOT_SRCS} APPEND_STRING PROPERTY COMPILE_FLAGS "$<$>: -O2 >") -@@ -7142,26 +7136,6 @@ IF(LIBM) - TARGET_LINK_LIBRARIES(indirection PRIVATE ${LIBM}) + IF(CMAKE_SYSTEM_NAME MATCHES "Windows") +@@ -534,7 +534,12 @@ IF(XNNPACK_BUILD_LIBRARY) + TARGET_LINK_LIBRARIES(operator-utils PRIVATE logging) + TARGET_LINK_LIBRARIES(post-operation PRIVATE logging) + TARGET_LINK_LIBRARIES(subgraph PRIVATE allocator logging memory mutex operators operator-run) +- TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection jit logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing post-operation microkernels-prod subgraph) ++ IF(CMAKE_SYSTEM_NAME STREQUAL "Emscripten") ++ # omit microkernels-prod as the list is manually created by ORT in cmake/external/xnnpack.cmake ++ TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection jit logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing post-operation subgraph) ++ ELSE() ++ TARGET_LINK_LIBRARIES(XNNPACK PRIVATE allocator cache hardware-config indirection jit logging memory microkernel-utils microparams-init mutex normalization operators operator-run operator-utils packing post-operation microkernels-prod subgraph) ++ ENDIF() + SET_TARGET_PROPERTIES(XNNPACK PROPERTIES C_EXTENSIONS YES) ENDIF() - --# ---[ Configure clog --IF(NOT TARGET clog) -- IF(NOT XNNPACK_USE_SYSTEM_LIBS) -- SET(CLOG_BUILD_TESTS OFF CACHE BOOL "") -- SET(CLOG_RUNTIME_TYPE "${CPUINFO_RUNTIME_TYPE}" CACHE STRING "") -- ADD_SUBDIRECTORY( -- "${CLOG_SOURCE_DIR}/deps/clog" -- "${CMAKE_BINARY_DIR}/clog") -- # We build static version of clog but a dynamic library may indirectly depend on it -- SET_PROPERTY(TARGET clog PROPERTY POSITION_INDEPENDENT_CODE ON) -- ELSE() -- ADD_LIBRARY(clog STATIC IMPORTED) -- FIND_LIBRARY(CLOG_LIBRARY clog) -- IF(NOT CLOG_LIBRARY) -- MESSAGE(FATAL_ERROR "Cannot find clog") -- ENDIF() -- SET_PROPERTY(TARGET clog PROPERTY IMPORTED_LOCATION 
"${CLOG_LIBRARY}") -- ENDIF() --ENDIF() -- - # ---[ Configure cpuinfo - IF(NOT TARGET cpuinfo) - IF(NOT XNNPACK_USE_SYSTEM_LIBS) + IF(NOT MSVC) diff --git a/csharp/OnnxRuntime.CSharp.proj b/csharp/OnnxRuntime.CSharp.proj index 0288d752d8749..69bfd9896f1e4 100644 --- a/csharp/OnnxRuntime.CSharp.proj +++ b/csharp/OnnxRuntime.CSharp.proj @@ -17,9 +17,13 @@ CMake creates a target to this project x64 false - false + true + true None + + true + .. ..\tools\nuget\generate_nuspec_for_native_nuget.py @@ -30,13 +34,15 @@ CMake creates a target to this project ..\build\Linux $(OnnxRuntimeBuildDirectory)\packages $(OnnxRuntimeBuildDirectory)\$(Configuration) + python3 - + ..\build\Windows $(OnnxRuntimeBuildDirectory)\packages $(OnnxRuntimeBuildDirectory)\$(Configuration)\$(Configuration) + python @@ -86,28 +92,48 @@ CMake creates a target to this project - + + + Properties="NoBuild=true;Platform=AnyCPU;PackageVersion=$(PackageVersion);OrtPackageId=$(OrtPackageId);IncludeMobileTargets=$(IncludeMobileTargets)"/> - - + + + - - + + + - + + + + - diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj index 29ccf55f081d5..0c74a23204d4f 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj +++ b/csharp/src/Microsoft.ML.OnnxRuntime/Microsoft.ML.OnnxRuntime.csproj @@ -4,66 +4,53 @@ Microsoft.ML.OnnxRuntime - - PreNet6 - netstandard2.0;netcoreapp3.1;net6.0 + true + netstandard2.0 + - - - xamarinios10;monoandroid11.0 + + + false - - monoandroid11.0 - + + NOTE: We include in a build of the managed package when creating Microsoft.ML.OnnxRuntime.Gpu as both + the CPU and GPU packaging pipelines can publish Microsoft.ML.OnnxRuntime.Managed, and we need the targets + to be consistent in both. 
+ --> - net6.0;net6.0-android;net6.0-ios;net6.0-macos + '$(OrtPackageId)' == 'Microsoft.ML.OnnxRuntime.Gpu') AND + '$(IncludeMobileTargets)' == 'true' AND + Exists('$(MSBuildExtensionsPath)\Xamarin\Android') AND + Exists('$(MSBuildExtensionsPath)\Xamarin\iOS')"> + xamarinios10;monoandroid11.0 - - net6.0;net6.0-android + + monoandroid11.0 - - $(BaseTargets);$(XamarinTargets);$(XamarinTargetsForTraining) + + + $(MobileTargets);net6.0-android;net6.0-ios - - $(Net6Targets);$(Net6TargetsForTrainingPackage) + + $(MobileTargets);net6.0-android - - - $(BaseTargets);$(XamarinTargets);$(XamarinTargetsForTraining);$(Net6Targets);$(Net6TargetsForTrainingPackage) + + $(BaseTargets);$(MobileTargets) - AnyCPU;x86 default @@ -204,8 +191,9 @@ $(DefineConstants);$(OrtConstants) - + @@ -214,7 +202,6 @@ - --> + + + netstandard2.0 + $(OnnxRuntimeBuildDirectory)/NativeNuget.nuspec + + diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md index 5b94a4a510934..0b82a9c031baa 100644 --- a/js/web/docs/webgpu-operators.md +++ b/js/web/docs/webgpu-operators.md @@ -20,15 +20,15 @@ Do not modify directly.* | Asinh | ai.onnx(9+) | | | Atan | ai.onnx(7+) | | | Atanh | ai.onnx(9+) | | -| AveragePool | ai.onnx(7-9,10,11+); com.ms.internal.nhwc(11+) | need perf optimization; need implementing activation | +| AveragePool | ai.onnx(7-9,10,11+); com.ms.internal.nhwc(7-9,10,11+) | need perf optimization; need implementing activation | | BiasAdd | com.microsoft(1+) | | | BiasSplitGelu | com.microsoft(1+) | | | Cast | ai.onnx(6-8,9-12,13-18,19+) | | | Ceil | ai.onnx(6-12,13+) | | | Clip | ai.onnx(6-10,11,12,13+) | | | Concat | ai.onnx(1-3,4-10,11-12,13+) | | -| Conv | ai.onnx(1-10,11+); com.ms.internal.nhwc(11+) | need perf optimization; conv3d is not supported; need implementing activation | -| ConvTranspose | ai.onnx(1-10,11+); com.ms.internal.nhwc(11+) | need perf optimization; ConvTranspose3d is not supported; need implementing activation | +| Conv | ai.onnx(1-10,11+); com.ms.internal.nhwc(1-10,11+) | need perf optimization; conv3d is not supported; need implementing activation | +| ConvTranspose | ai.onnx(1-10,11+); com.ms.internal.nhwc(1-10,11+) | need perf optimization; ConvTranspose3d is not supported; need implementing activation | | Cos | ai.onnx(7+) | | | Cosh | ai.onnx(9+) | | | Div | ai.onnx(7-12,13,14+) | | @@ -57,7 +57,7 @@ Do not modify directly.* | LessOrEqual | ai.onnx(12-15,16+) | | | Log | ai.onnx(6-12,13+) | | | MatMul | ai.onnx(1-12,13+) | | -| MaxPool | ai.onnx(1-7,8-9,10,11,12+); com.ms.internal.nhwc(11,12+) | need perf optimization; need implementing activation | +| MaxPool | ai.onnx(1-7,8-9,10,11,12+); com.ms.internal.nhwc(1-7,8-9,10,11,12+) | need perf optimization; need implementing activation | | MemcpyFromHost | ai.onnx(1+) | | | MemcpyToHost | ai.onnx(1+) | | | Mul | ai.onnx(7-12,13,14+) | | @@ -79,7 +79,7 @@ Do not modify directly.* | ReduceSumSquare | ai.onnx(1-10,11-12,13-17,18+) | | | Relu | ai.onnx(6-12,13,14+) | | | Reshape | ai.onnx(5-12,13,14+) | no GPU kernel | -| Resize | ai.onnx(10,11-12,13-17,18,19+); com.ms.internal.nhwc(11-12,13-17,18,19+) | CoordinateTransformMode align_corners is not supported with downsampling | +| Resize | ai.onnx(10,11-12,13-17,18,19+); com.ms.internal.nhwc(10,11-12,13-17,18,19+) | CoordinateTransformMode align_corners is not supported with downsampling | | Shape | ai.onnx(1-12,13-14,15+) | no GPU kernel; an ORT warning is generated - need to fix | | Sigmoid | ai.onnx(6-12,13+) | | | Sin | ai.onnx(7+) | | diff --git 
a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index bead3e72f63c7..4238449f9246f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -29,12 +29,12 @@ const createElementwiseProgramShader = const output = outputVariable('outputData', outputDataType, [vecSize], 4); return ` - ${shaderHelper.declareVariables(input, output)} + ${shaderHelper.registerUniform('vec_size', 'u32').declareVariables(input, output)} ${additionalImplementation ?? ''} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(vecSize)} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.vec_size')} let a = ${input.getByOffset('global_idx')}; ${output.setByOffset('global_idx', expression)} @@ -45,13 +45,16 @@ const createElementwiseProgramInfo = (input: TensorView, name: string, funcCall: ElementwiseFunctionCall, additionalImplementation?: string, cacheKey?: string, outputDataType: number = input.dataType): ProgramInfo => ({ name, - shaderCache: {hint: cacheKey}, + shaderCache: {hint: cacheKey, inputDependencies: ['type']}, getShaderSource: shaderHelper => createElementwiseProgramShader( shaderHelper, ShapeUtil.size(input.dims), input.dataType, outputDataType, funcCall, additionalImplementation), getRunData: (inputTensors) => ({ outputs: [{dims: input.dims, dataType: outputDataType}], dispatchGroup: - {x: Math.ceil(ShapeUtil.size(inputTensors[0].dims) / 64 /* workgroup size */ / 4 /* vec size */)} + {x: Math.ceil(ShapeUtil.size(inputTensors[0].dims) / 64 /* workgroup size */ / 4 /* vec size */)}, + programUniforms: [ + {type: 'uint32', data: Math.ceil(ShapeUtil.size(input.dims) / 4)}, + ], }) }); diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 628e5408150f8..29acc07e118f9 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -164,7 +164,10 @@ async function initializeSession( session = await ort.InferenceSession.create(modelFilePath, sessionConfig); } } catch (e) { - Logger.error('TestRunner', `Failed to load model from file: ${modelFilePath}. Error: ${inspect(e)}`); + Logger.error( + 'TestRunner', + `Failed to load model from file: ${modelFilePath}. ` + + `Error: ${e.message} @ ${e.fileName}:${e.lineNumber}`); throw e; } diff --git a/onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block.h b/onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block.h deleted file mode 100644 index 11b5447d65ed2..0000000000000 --- a/onnxruntime/contrib_ops/cpu/quantization/blockwise_quant_block.h +++ /dev/null @@ -1,129 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
- -#pragma once - -#include -#include -#include - -namespace onnxruntime { -namespace contrib { - -#if defined(_MSC_VER) -#define FORCEINLINE __forceinline -#else -#define FORCEINLINE __attribute__((always_inline)) inline -#endif - -template -struct alignas(1) BlockwiseQuantBlock { - static_assert(block_size % 8 == 0); - - uint8_t blob_data[block_size / 8 * bits]; - - FORCEINLINE void dequant(T* dst, T scale, int32_t k_idx, int32_t K) const; - FORCEINLINE void dequant(T* dst, T scale, uint8_t zp, int32_t k_idx, int32_t K) const; - - FORCEINLINE void quant(const T* src, T& scale, int32_t k_idx, int32_t K, int32_t N); - FORCEINLINE void quant(const T* src, T& scale, uint8_t& zp, int32_t k_idx, int32_t K, int32_t N); -}; - -template -struct alignas(1) BlockwiseQuantBlock { - static_assert(block_size % 8 == 0); - - uint8_t blob_data[block_size / 2]; - - FORCEINLINE void dequant(T* dst, T scale, uint8_t zp, int32_t k_idx, int32_t K) const { - for (int i = 0; i < block_size; i += 2) { - T zp_t = static_cast(float(zp)); - if (k_idx + i < K) { - T x0 = static_cast(float(blob_data[i / 2] & 0xF)); - dst[i] = scale * (x0 - zp_t); - } - if (k_idx + i + 1 < K) { - T x1 = static_cast(float(blob_data[i / 2] >> 4)); - dst[i + 1] = scale * (x1 - zp_t); - } - } - } - - FORCEINLINE void dequant(T* dst, T scale, int32_t k_idx, int32_t K) const { - constexpr uint8_t zp = 8; - dequant(dst, scale, zp, k_idx, K); - } - - FORCEINLINE void quant(const T* src, T& scale_block, uint8_t& zp, int32_t k_idx, int32_t K, int32_t N) { - float min = static_cast(*src); - float max = static_cast(*src); - int32_t klen = std::min(block_size, K - k_idx); - for (int32_t kk = 0; kk < klen; kk++) { - const float v = static_cast(src[N * kk]); - if (v < min) min = v; - if (v > max) max = v; - } - min = std::min(min, 0.0f); - max = std::max(max, 0.0f); - - const float scale = (max - min) / ((1 << 4) - 1); - scale_block = static_cast(scale); - - const float reciprocal_scale = scale ? 1.0f / scale : 0.0f; - float zero_point_fp = min; - if (scale != 0.0f) { - zero_point_fp = 0.f - min / scale; - } - - // Handle any clamping - if (zero_point_fp < 0.0f) { - zp = 0; - } else if (zero_point_fp > 15.0f) { - zp = 15; - } else { - zp = (uint8_t)roundf(zero_point_fp); - } - - for (int32_t kk = 0; kk < klen; kk += 2) { - const float v0 = static_cast(src[N * kk]); - const uint8_t vi0 = (uint8_t)std::min(15.0f, std::max(0.0f, roundf(v0 * reciprocal_scale + zp))); - - const float v1 = static_cast((kk + 1 < klen) ? src[N * (kk + 1)] : 0.f); - const uint8_t vi1 = (uint8_t)std::min(15.0f, std::max(0.0f, roundf(v1 * reciprocal_scale + zp))); - - blob_data[kk / 2] = vi0 | (vi1 << 4); - } - } - - FORCEINLINE void quant(const T* src, T& scale_block, int32_t k_idx, int32_t K, int32_t N) { - float amax = 0.0f; // abs(max) - float max = 0.0f; - - int32_t klen = std::min(block_size, K - k_idx); - - for (int32_t kk = 0; kk < klen; kk++) { - const float v = static_cast(src[N * kk]); - if (amax < fabsf(v)) { - amax = fabsf(v); - max = v; - } - } - - const float scale = max / (-8.f); - scale_block = static_cast(scale); - const float reciprocal_scale = scale ? 1.0f / scale : 0.0f; - - for (int32_t kk = 0; kk < klen; kk += 2) { - const float v0 = src[N * kk] * reciprocal_scale; - const uint8_t vi0 = (uint8_t)std::min(15.0f, std::max(0.0f, roundf(v0 + 8.f))); - - const float v1 = (kk + 1 < klen) ? 
src[N * (kk + 1)] * reciprocal_scale : 0; - const uint8_t vi1 = (uint8_t)std::min(15.0f, std::max(0.0f, roundf(v1 + 8.f))); - - blob_data[kk / 2] = vi0 | (vi1 << 4); - } - } -}; - -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise.h b/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise.h deleted file mode 100644 index 8811e5649fc19..0000000000000 --- a/onnxruntime/contrib_ops/cpu/quantization/dequantize_blockwise.h +++ /dev/null @@ -1,174 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "blockwise_quant_block.h" - -#include - -#include "core/common/safeint.h" -#include "core/framework/float16.h" -#include "core/platform/threadpool.h" -#include - -namespace onnxruntime { -namespace contrib { - -template -void QuantizeBlockwise( - uint8_t* dst, // shape: [ N, block_per_K, block_blob_size ] - const T* src, // shape: [K, N] - T* scale, // shape: [N * block_per_K] - uint8_t* zero_points, // shape: [N * block_per_K] if bits > 4 else [(N *block_per_K + 1) / 2] - int32_t N, - int32_t K, - onnxruntime::concurrency::ThreadPool* thread_pool) { - BlockwiseQuantBlock* dst_blob = - reinterpret_cast*>(dst); - - int32_t block_per_K = (K + block_size - 1) / block_size; - int32_t total_block_count = N * block_per_K; - - std::vector zero_points_tmp; // to avoid race condition - (void)zero_points_tmp; - uint8_t* zero_points_tmp_ptr = zero_points; - if (bits <= 4 && zero_points != nullptr) { - zero_points_tmp.resize(total_block_count, 0); - zero_points_tmp_ptr = zero_points_tmp.data(); - } - - concurrency::ThreadPool::TryBatchParallelFor( - thread_pool, - total_block_count, - [&](ptrdiff_t block_idx) { - int32_t n = static_cast(block_idx / block_per_K); - int32_t k_block_idx = static_cast(block_idx % block_per_K); - int32_t k = k_block_idx * block_size; - BlockwiseQuantBlock* blob_ptr = dst_blob + block_idx; - size_t offset = SafeInt(k) * N + n; - if (nullptr != zero_points_tmp_ptr) { - blob_ptr->quant(src + offset, scale[block_idx], zero_points_tmp_ptr[block_idx], k, K, N); - } else { - blob_ptr->quant(src + offset, scale[block_idx], k, K, N); - } - }, - 0); - - if (bits <= 4 && zero_points != nullptr) { // compact zero points - for (int32_t zp_idx = 0; zp_idx < total_block_count / 2; zp_idx++) { - zero_points[zp_idx] = ((zero_points_tmp[zp_idx * 2]) | (zero_points_tmp[zp_idx * 2 + 1] << 4)); - } - if (total_block_count & 1) { - zero_points[total_block_count / 2] = (zero_points[total_block_count / 2] & 0xf0) | zero_points_tmp[total_block_count - 1]; - } - } -} - -#define QuantizeBlockwise4Bits(block_size) \ - QuantizeBlockwise(dst, src, scale, zero_points, N, K, thread_pool); - -template -void QuantizeBlockwise( - uint8_t* dst, // shape: [ N, block_per_K, block_blob_size ] - const T* src, // shape: [K, N] - T* scale, // shape: [N, block_per_K] - uint8_t* zero_points, // shape: [N, block_per_K] - int32_t block_size, - int32_t bits, - int32_t N, - int32_t K, - onnxruntime::concurrency::ThreadPool* thread_pool) { - ORT_ENFORCE(bits == 4, "only 4 bits is supported now"); - - if (16 == block_size) { - QuantizeBlockwise4Bits(16); - } else if (32 == block_size) { - QuantizeBlockwise4Bits(32); - } else if (64 == block_size) { - QuantizeBlockwise4Bits(64); - } else if (128 == block_size) { - QuantizeBlockwise4Bits(128); - } else if (256 == block_size) { - QuantizeBlockwise4Bits(256); - } else { - ORT_NOT_IMPLEMENTED("only block size 16, 32, 64, 128, 
256 are supported."); - } -} - -#undef QuantizeBlockwise4Bits - -template -void DequantizeBlockwise( - T* dst, // shape: [N, K] - const uint8_t* src, // shape: [N, block_per_K, block_blob_size] - const T* scale, // shape: [N, block_per_K] - const uint8_t* zero_points, // shape: [N, block_per_K] if bits > 4 else [N, (block_per_K + 1) / 2] - int32_t N, - int32_t K, - onnxruntime::concurrency::ThreadPool* thread_pool) { - int32_t block_per_K = (K + block_size - 1) / block_size; - int32_t task_count = N * block_per_K; - - const BlockwiseQuantBlock* src_blob = - reinterpret_cast*>(src); - - concurrency::ThreadPool::TryBatchParallelFor( - thread_pool, - task_count, - [&](ptrdiff_t task_idx) { - int32_t n = static_cast(task_idx / block_per_K); - int32_t k_block_idx = static_cast(task_idx % block_per_K); - int32_t k = k_block_idx * block_size; - const BlockwiseQuantBlock* blob_ptr = src_blob + task_idx; - size_t offset = SafeInt(n) * K + k; - if (nullptr != zero_points) { - if constexpr (bits > 4) { // zero point is stored with a byte - blob_ptr->dequant(dst + offset, scale[task_idx], zero_points[task_idx], k, K); - } else { // zero points is stored with 4bits - uint8_t zp = zero_points[task_idx / 2]; - zp = (task_idx & 1) ? (zp >> 4) : (zp & 0xf); - blob_ptr->dequant(dst + offset, scale[task_idx], zp, k, K); - } - } else { - blob_ptr->dequant(dst + offset, scale[task_idx], k, K); - } - }, - 0); -} - -#define DequantizeBlockwise4Bits(block_size) \ - DequantizeBlockwise(dst, src, scale, zero_points, N, K, thread_pool); - -template -void DequantizeBlockwise( - T* dst, // [N, K] - const uint8_t* src, // [N, block_per_K, block_blob_size] - const T* scale, // [N, block_per_K] - const uint8_t* zero_points, // [N, block_per_K] - int32_t block_size, - int32_t bits, - int32_t N, - int32_t K, - onnxruntime::concurrency::ThreadPool* thread_pool) { - ORT_ENFORCE(bits == 4, "only 4 bits is supported now"); - - if (16 == block_size) { - DequantizeBlockwise4Bits(16); - } else if (32 == block_size) { - DequantizeBlockwise4Bits(32); - } else if (64 == block_size) { - DequantizeBlockwise4Bits(64); - } else if (128 == block_size) { - DequantizeBlockwise4Bits(128); - } else if (256 == block_size) { - DequantizeBlockwise4Bits(256); - } else { - ORT_NOT_IMPLEMENTED("only block size 16, 32, 64, 128, 256 are supported."); - } -} - -#undef DequantizeBlockwise4Bits - -} // namespace contrib -} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index 57aada94be39c..c72d811170a27 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -5,8 +5,7 @@ #include "core/framework/op_kernel.h" #include "core/providers/cpu/math/matmul_helper.h" #include "core/providers/common.h" -#include "dequantize_blockwise.h" -#include "core/mlas/inc/mlas.h" +#include "core/mlas/inc/mlas_q4.h" namespace onnxruntime { namespace contrib { @@ -18,6 +17,9 @@ class MatMulNBits final : public OpKernel { ORT_ENFORCE(Status::OK() == info.GetAttr("N", &N_)); ORT_ENFORCE(Status::OK() == info.GetAttr("block_size", &block_size_)); ORT_ENFORCE(Status::OK() == info.GetAttr("bits", &nbits_)); + ORT_ENFORCE(nbits_ == 4, + "Only 4b quantization is supported for MatMulNBits op," + " additional bits support is planned."); } Status Compute(OpKernelContext* context) const override; @@ -27,6 +29,7 @@ class MatMulNBits final : public OpKernel { int64_t N_; int64_t block_size_; int64_t 
nbits_; + bool column_wise_quant_{true}; }; Status MatMulNBits::Compute(OpKernelContext* ctx) const { @@ -46,15 +49,18 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { auto status = ctx->GetTempSpaceAllocator(&allocator); ORT_RETURN_IF_ERROR(status); auto tmp_b_data_ptr = IAllocator::MakeUniquePtr(allocator, SafeInt(K_) * N_); - DequantizeBlockwise(tmp_b_data_ptr.get(), - b_data, - scales_data, - zero_points_data, - static_cast(block_size_), - static_cast(nbits_), - static_cast(N_), - static_cast(K_), - thread_pool); + + // dequantize b, only 4b quantization is supported for now + MlasDequantizeBlockwise( + tmp_b_data_ptr.get(), // dequantized output + b_data, // quantized input + scales_data, // quantization scales + zero_points_data, // quantization zero points + static_cast(block_size_), // quantization block size + column_wise_quant_, // columnwise quantization or row-wise + static_cast(K_), // number of rows in quantized input + static_cast(N_), // number of columns in quantized input + thread_pool); #if 0 // for debug auto tm_b_data_ptr_trans = IAllocator::MakeUniquePtr(allocator, SafeInt(K_) * N_); diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu index 8c328d00b44d0..7921315ab52e1 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu @@ -18,6 +18,7 @@ namespace onnxruntime { namespace contrib { namespace cuda { + __device__ __forceinline__ void DequantizeEightElements(uint32_t values_quant, half scale, half zp, half* output) { half2 scale_half2 = {scale, scale}; half zp_adjust = -scale * __short2half_rn(zp); @@ -61,15 +62,19 @@ __global__ void Dequantize4BitsKernel( const T* scale_data, const uint8_t* zero_points, int block_size, + int blocks_per_K, int blocks_per_threadblock, int shift) { int block_id = blockIdx.x * blocks_per_threadblock + ((threadIdx.x * 8) >> shift); + int n_idx = block_id / blocks_per_K; + int kb_idx = block_id % blocks_per_K; int element_offset = block_id * block_size + ((threadIdx.x * 8) & ((1 << shift) - 1)); uint32_t quant_value = *(reinterpret_cast(quant_data + element_offset / 2)); T scale = *(scale_data + block_id); uint8_t zp = 8; if (zero_points) { - zp = (block_id & 0x01) ? (zero_points[block_id / 2] >> 4) : (zero_points[block_id / 2] & 0x0f); + zp = zero_points[n_idx * ((blocks_per_K + 1)/2) + kb_idx / 2]; + zp = (kb_idx & 0x01) ? (zp >> 4) : (zp & 0x0f); } output = output + element_offset; @@ -100,6 +105,7 @@ Status Dequantize4Bits( scales_data, zero_points, block_size, + blocks_per_K, blocks_per_threadblock, shift); @@ -126,6 +132,244 @@ template Status Dequantize4Bits( int block_size, cudaStream_t stream); + +/////////////////////////////////////////////////////////////////////////////// +// A more general block-wise dequantization implementation that supports +// different block sizes and block orientations (row-wise/column-wise). + +template < + int Row_, ///< rows of a matrix + int Column_ ///< columns of a matrix + > +struct Shape2D { + static int const kRow = Row_; ///< rows of a matrix + static int const kColumn = Column_; ///< columns of a matrix + static int const kCount = Row_ * Column_; ///< total number of elements in a matrix +}; + +/** + * @brief Blockwise quantization constants + * @tparam ElementT source data type, e.g. 
fp32/fp16 + * @tparam block_size number of elemenets quantized together + * @tparam qbits number of bits in each quantized element + * @tparam Columnwise true: elements in a block come from one single column + * false: elements in a block come from one single row + */ +template < + typename ElementT, + int32_t block_size, + int32_t qbits, + bool Columnwise> +struct BlkQuantTraits { + // number of qbit elements to pack into whole bytes + static constexpr int kPackSize = (qbits == 8) ? 1 : (qbits == 4) ? 2 : (qbits == 2) ? 4 : 0; + static_assert(kPackSize != 0, "Packing to whole bytes not supported for this qbits!"); + + using QuantBlk = std::conditional_t, Shape2D<1, block_size>>; + using ThreadBlk = Shape2D; +}; + +template < + typename ElementT, + int32_t block_size, + int32_t qbits, + bool Columnwise> +__global__ +void dequantizeThread(ElementT* dst, + const uint8_t* weights, + const ElementT* scales, + const uint8_t* zero_points, + int rows, + int columns, + int thrd_row_blks) { + using QuantBlk = typename BlkQuantTraits::QuantBlk; + using ThreadBlk = typename BlkQuantTraits::ThreadBlk; + + // !! 4b specific code + static_assert(qbits == 4, "Only 4b block quantization is supported!"); + + const auto block_idx = blockIdx.x * blockDim.x + threadIdx.x; + const auto row_blks = (rows + QuantBlk::kRow - 1) / QuantBlk::kRow; + + const auto meta_rows = (rows + QuantBlk::kRow - 1) / QuantBlk::kRow; + + // quantized matrix is stored in column major, packed by column + const auto q_rows = (meta_rows * QuantBlk::kRow * qbits + 7) / 8; + + int32_t r_blk_idx = static_cast(block_idx % thrd_row_blks); + int32_t c_blk_idx = static_cast(block_idx / thrd_row_blks); + + int32_t r = r_blk_idx * ThreadBlk::kRow; + int32_t c = c_blk_idx * ThreadBlk::kColumn; + + int32_t r_end = std::min(r + ThreadBlk::kRow, rows); + int32_t c_end = std::min(c + ThreadBlk::kColumn, columns); + + // for 4b quant, kPackSize = 2, so we have 2 scales and 2 offsets + const ElementT scale_buf[2] = { + scales[(c / QuantBlk::kColumn) * row_blks + r / QuantBlk::kRow], + ((r/QuantBlk::kRow) < (meta_rows - 1)) + ? scales[(c / QuantBlk::kColumn) * row_blks + r / QuantBlk::kRow + 1] + : static_cast(0.0f)}; + const uint8_t zp_pair = (zero_points == nullptr) + ? 
0x88 + : zero_points[(c / QuantBlk::kColumn) * ((row_blks + 1) / 2) + (r / QuantBlk::kRow) / 2]; + const uint16_t zp_buf[2] = {(uint16_t)(zp_pair & 0x0f), (uint16_t)((zp_pair >> 4) & 0x0f)}; + const ElementT adjust_buf[2] = {(-scale_buf[0]) * static_cast(zp_buf[0]), + (-scale_buf[1]) * static_cast(zp_buf[1])}; + + for (int32_t j = c; j < c_end; ++j) { + const uint8_t* q_ptr = weights + j * q_rows; + for (int32_t i = r; i < (r_end - 1); i += 2) { + const auto scale0 = scale_buf[(i - r) / QuantBlk::kRow]; + const auto adjust0 = adjust_buf[(i - r) / QuantBlk::kRow]; + + const auto scale1 = scale_buf[(i + 1 - r) / QuantBlk::kRow];; + const auto adjust1 = adjust_buf[(i + 1 - r) / QuantBlk::kRow]; + + const auto vi = q_ptr[i / 2]; + + if constexpr (std::is_same::value) { + half2 scale_half2 = {scale0, scale1}; + half2 zp_adjust2 = {adjust0, adjust1}; + + half2 v = {__ushort2half_rn(vi & 0xF), __ushort2half_rn((vi >> 4) & 0xF)}; + half2 results = v * scale_half2 + zp_adjust2; + + dst[j * rows + i] = results.x; + dst[j * rows + (i + 1)] = results.y; + } else { + static_assert(std::is_same::value, "Only float and half are supported!"); + const uint8_t vi0 = vi & 0xf; + const uint8_t vi1 = vi >> 4; + dst[j * rows + i] = static_cast(vi0) * scale0 + adjust0;; + dst[j * rows + (i + 1)] = static_cast(vi1) * scale1 + adjust1; + } + } + + if ((r_end & 1) && (r_end > r)) { + const auto scale0 = scale_buf[(r_end - 1 - r) / QuantBlk::kRow]; + const auto adjust0 = adjust_buf[(r_end - 1 - r) / QuantBlk::kRow]; + + const auto vi = q_ptr[(r_end - 1) / 2]; + const uint8_t vi0 = vi & 0xf; + + dst[j * rows + (r_end - 1)] = static_cast(vi0) * scale0 + adjust0; + } + } +} + +template < + typename ElementT, + int32_t block_size, + int32_t qbits, + bool Columnwise> +static void dequantize(ElementT* dst, const uint8_t* weights, const ElementT* scales, + const uint8_t* zero_points, int32_t rows, int32_t columns, + cudaStream_t stream) { + using QuantBlk = typename BlkQuantTraits::QuantBlk; + using ThreadBlk = typename BlkQuantTraits::ThreadBlk; + + // Thread partitioning + const auto thrd_row_blks = (rows + ThreadBlk::kRow - 1) / ThreadBlk::kRow; + const auto thrd_col_blks = (columns + ThreadBlk::kColumn - 1) / ThreadBlk::kColumn; + const auto total_thrd_blks = thrd_row_blks * thrd_col_blks; + + const auto grids = (total_thrd_blks + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock; + dequantizeThread<<>>( + dst, + weights, + scales, + zero_points, + rows, + columns, + thrd_row_blks); +} + + +template +Status +DequantizeBlockwise4b( + T* dst, + const uint8_t* src, + const T* scales, + const uint8_t* zero_points, + int block_size, + bool columnwise, + int rows, + int columns, + cudaStream_t stream) { + switch (block_size) { + case 16: + if (columnwise) { + dequantize(dst, src, scales, zero_points, rows, columns, stream); + } else { + dequantize(dst, src, scales, zero_points, rows, columns, stream); + } + return Status::OK(); + case 32: + if (columnwise) { + dequantize(dst, src, scales, zero_points, rows, columns, stream); + } else { + dequantize(dst, src, scales, zero_points, rows, columns, stream); + } + return Status::OK(); + case 64: + if (columnwise) { + dequantize(dst, src, scales, zero_points, rows, columns, stream); + } else { + dequantize(dst, src, scales, zero_points, rows, columns, stream); + } + return Status::OK(); + case 128: + if (columnwise) { + dequantize(dst, src, scales, zero_points, rows, + columns, stream); + } else { + dequantize(dst, src, scales, zero_points, + rows, columns, stream); + 
} + return Status::OK(); + case 256: + if (columnwise) { + dequantize(dst, src, scales, zero_points, rows, + columns, stream); + } else { + dequantize(dst, src, scales, zero_points, + rows, columns, stream); + } + return Status::OK(); + default: + // Only block size 16, 32, 64, 128, 256 are supported. + return Status(::onnxruntime::common::ONNXRUNTIME, ::onnxruntime::common::FAIL, + "Unsupported block size for blockwise quantization."); + } +} + +template +Status DequantizeBlockwise4b( + float* dst, + const uint8_t* src, + const float* scales, + const uint8_t* zero_points, + int block_size, + bool columnwise, + int rows, + int columns, + cudaStream_t stream); + +template +Status DequantizeBlockwise4b( + half* dst, + const uint8_t* src, + const half* scales, + const uint8_t* zero_points, + int block_size, + bool columnwise, + int rows, + int columns, + cudaStream_t stream); + } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh index 741ce1e735b42..f9c09c55fd893 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh +++ b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cuh @@ -18,6 +18,33 @@ Status Dequantize4Bits( int block_size, cudaStream_t stream); + +/** + * @brief Dequantize a block-wise quantized matrix, and store the result in a + * column major matrix for use in subsequent GEMM. This implementation supports + * columnwise and rowwise block orientation. + * @param[out] dst pointer to the dequantized matrix, column major: [columns, rows] + * @param[in] qelements pointer to the quantized elements, column major: [columns, rows] + * @param[in] scales pointer to the scales of quantized blocks, column major layout + * @param[in] zero_points pointer to the zero points of quantized blocks, packed column major + * scales + * @param[in] block_size size of the quantized block + * @param[in] columnwise whether the quantized matrix is columnwise or rowwise quantized + * @param[in] rows + * @param[in] columns + */ +template +Status DequantizeBlockwise4b( + T* dst, + const uint8_t* qelements, + const T* scales, + const uint8_t* zero_points, + int block_size, + bool columnwise, + int rows, + int columns, + cudaStream_t stream); + } // namespace cuda } // namespace contrib } // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc index 14a8163fef500..5b0e61e197014 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc @@ -27,6 +27,9 @@ class MatMulNBits final : public CudaKernel { ORT_ENFORCE(Status::OK() == info.GetAttr("N", &N_)); ORT_ENFORCE(Status::OK() == info.GetAttr("block_size", &block_size_)); ORT_ENFORCE(Status::OK() == info.GetAttr("bits", &nbits_)); + ORT_ENFORCE(nbits_ == 4, + "Only 4b quantization is supported for MatMulNBits op," + " additional bits support is planned."); } Status ComputeInternal(OpKernelContext* context) const override; @@ -36,6 +39,7 @@ class MatMulNBits final : public CudaKernel { int64_t N_; int64_t block_size_; int64_t nbits_; + bool column_wise_quant_blk_{true}; }; template @@ -50,8 +54,6 @@ Status MatMulNBits::ComputeInternal(OpKernelContext* ctx) const { const auto* scales_data = scales->Data(); const auto* zero_points_data = zero_points == nullptr ? 
nullptr : zero_points->Data(); - ORT_ENFORCE(nbits_ == 4, "only 4 bits is supported now"); - typedef typename ToCudaType::MappedType CudaT; constexpr bool transa = false; @@ -81,14 +83,32 @@ Status MatMulNBits::ComputeInternal(OpKernelContext* ctx) const { int64_t K_padded = (K_ + block_size_ - 1) / block_size_ * block_size_; IAllocatorUniquePtr b_data_ptr = GetScratchBuffer(N_ * K_padded, ctx->GetComputeStream()); auto* b_data = b_data_ptr.get(); - ORT_RETURN_IF_ERROR(Dequantize4Bits(reinterpret_cast(b_data), - blob_data, - reinterpret_cast(scales_data), - zero_points_data, - SafeInt(K_padded), - SafeInt(N_), - SafeInt(block_size_), - static_cast(ctx->GetComputeStream()->GetHandle()))); + if (column_wise_quant_blk_) { + // column-wise block + ORT_RETURN_IF_ERROR(Dequantize4Bits( + reinterpret_cast(b_data), + blob_data, + reinterpret_cast(scales_data), + zero_points_data, + SafeInt(K_padded), + SafeInt(N_), + SafeInt(block_size_), + static_cast(ctx->GetComputeStream()->GetHandle()))); + } else { + // row-wise block + K_padded = K_; + + ORT_RETURN_IF_ERROR(DequantizeBlockwise4b( + reinterpret_cast(b_data), + blob_data, + reinterpret_cast(scales_data), + zero_points_data, + SafeInt(block_size_), + column_wise_quant_blk_, + SafeInt(K_), + SafeInt(N_), + static_cast(ctx->GetComputeStream()->GetHandle()))); + } #if 0 cudaStreamSynchronize(static_cast(ctx->GetComputeStream()->GetHandle())); T* b_data_cpu = new T[K_ * N_]; diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu index 4c3c345076416..f2600a506285d 100644 --- a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu +++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cu @@ -96,6 +96,9 @@ __global__ void MatMulFloatInt4Kernel( constexpr int k_per_iter = 256; int k_iter = k / k_per_iter; + // blocks_per_k is the number of scales and zero points on the k dim + const int b_zp_k = (blocks_per_K + 1)/ 2; + extern __shared__ char shared_buffer[]; // load scale to shared buffer @@ -105,30 +108,39 @@ __global__ void MatMulFloatInt4Kernel( for (int i = thread_id; i < kColsPerThreadBlock * blocks_per_K; i += kColsPerThreadBlock * kWarpSize) { b_scale_vec[i] = scales_data[offset + i]; } - for (int i = thread_id; i < kColsPerThreadBlock * blocks_per_K / 2; i += kColsPerThreadBlock * kWarpSize) { - b_zp_vec[i] = zero_points != nullptr ? zero_points[offset / 2 + i] : uint8_t(0x88); + + int zp_offset = n_block_id * kColsPerThreadBlock * b_zp_k; + for (int i = thread_id; i < kColsPerThreadBlock * b_zp_k; i += kColsPerThreadBlock * kWarpSize) { + b_zp_vec[i] = zero_points != nullptr ? zero_points[zp_offset + i] : uint8_t(0x88); } __syncthreads(); a_data += m_id * k; b_data_quant += n_id * blocks_per_K * (block_size / 2); + const int scale_col_offset = warp_id * blocks_per_K; + const int zp_col_offset = warp_id * b_zp_k; + float sum = 0.f; int k_id = 0; for (; k_id < (k & 0xffffff00); k_id += k_per_iter) { - uint32_t value = *(reinterpret_cast(b_data_quant + (k_id >> 1) + lane_id * 4)); - int32_t block_idx = warp_id * blocks_per_K + (k_id + lane_id * 8) / block_size; - T scale = b_scale_vec[block_idx]; - uint8_t zp = (block_idx & 0x01) ? 
(b_zp_vec[block_idx / 2] >> 4) : (b_zp_vec[block_idx / 2] & 0x0f); + const int t_k = k_id + (lane_id << 3); // k index for this thread + const int t_meta_k = t_k / block_size; // k index for this thread, points to the scale and zero point + uint32_t value = *(reinterpret_cast(b_data_quant + (t_k >> 1))); + T scale = b_scale_vec[scale_col_offset + t_meta_k]; + uint8_t zp = b_zp_vec[zp_col_offset + t_meta_k/2]; + zp = (t_meta_k & 0x01) ? (zp >> 4) : (zp & 0x0f); sum += AccumulateEightElements(value, scale, zp, a_data + k_id + (lane_id << 3)); } // handle reminder if (k_id + lane_id * 8 < k) { + const int t_k = k_id + (lane_id << 3); // k index for this thread + const int t_meta_k = t_k / block_size; // k index for this thread, points to the scale and zero point uint32_t value = *(reinterpret_cast(b_data_quant + k_iter * 128 + lane_id * 4)); - int32_t block_idx = warp_id * blocks_per_K + (k_id + lane_id * 8) / block_size; - T scale = b_scale_vec[block_idx]; - uint8_t zp = (block_idx & 0x01) ? (b_zp_vec[block_idx / 2] >> 4) : (b_zp_vec[block_idx / 2] & 0x0f); + T scale = b_scale_vec[scale_col_offset + t_meta_k]; + uint8_t zp = b_zp_vec[zp_col_offset + t_meta_k/2]; + zp = (t_meta_k & 0x01) ? (zp >> 4) : (zp & 0x0f); sum += AccumulateEightElements(value, scale, zp, a_data + k_id + (lane_id << 3)); } diff --git a/onnxruntime/core/framework/kernel_registry_manager.cc b/onnxruntime/core/framework/kernel_registry_manager.cc index c4eef5b27c1bb..b2ef853119588 100644 --- a/onnxruntime/core/framework/kernel_registry_manager.cc +++ b/onnxruntime/core/framework/kernel_registry_manager.cc @@ -62,8 +62,13 @@ Status KernelRegistryManager::SearchKernelRegistry(const Node& node, auto create_error_message = [&node, &status](const std::string& prefix) { std::ostringstream errormsg; - errormsg << prefix << node.OpType() << "(" << node.SinceVersion() << ")"; - errormsg << " (node:'" << node.Name() << "' ep:'" << node.GetExecutionProviderType() << "'). "; + errormsg << prefix; + const auto& domain = node.Domain(); + if (!domain.empty()) { + errormsg << domain << "."; + } + errormsg << node.OpType() << "(" << node.SinceVersion() << ")" + << " (node:'" << node.Name() << "' ep:'" << node.GetExecutionProviderType() << "'). "; if (!status.IsOK()) errormsg << status.ErrorMessage(); diff --git a/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc b/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc index d3fc5873cb274..03ad95260c0ad 100644 --- a/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc +++ b/onnxruntime/core/graph/contrib_ops/internal_nhwc_onnx_schemas.cc @@ -90,12 +90,16 @@ void RegisterNHWCSchemaWithActivation(const RegistrationFunc& f, ::ONNX_NAMESPAC void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::function& fn) { // if the operator may be fused with an activation, use the WITH_ACTIVATION variant to add optional attributes // for the activation parameters. - // For now we only register operators from opset 11 on. Models can easily have their opset updated using ONNX tools + // We mainly register operators from opset 11 on . Models can easily have their opset updated using ONNX tools // so supporting older opsets is unnecessary. + // Older opsets are included on a per-operator basis as needed. 
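Stepping back to the MatMulFloatInt4Kernel change earlier in this diff: the new indexing derives a per-thread meta index t_meta_k = t_k / block_size and then selects the low or high nibble of the packed zero-point byte for that block. A standalone C++ sketch of that unpacking convention (not part of the patch; it assumes the same low-nibble-first packing and the 0x88 default used when no zero points are provided):

#include <cstdint>
#include <cstdio>
#include <vector>

// Dequantize one 4-bit element given its block scale and the packed zero points.
// Convention assumed from the kernel above: two 4-bit values per byte, low nibble first,
// and a missing zero_points buffer behaves like bytes of 0x88 (i.e. zero point 8).
float DequantizeInt4(uint8_t packed_pair, bool high_nibble, float scale,
                     const uint8_t* packed_zero_points, int meta_k) {
  uint8_t q = high_nibble ? (packed_pair >> 4) : (packed_pair & 0x0f);
  uint8_t zp_byte = packed_zero_points ? packed_zero_points[meta_k / 2] : uint8_t(0x88);
  uint8_t zp = (meta_k & 0x01) ? (zp_byte >> 4) : (zp_byte & 0x0f);
  return (static_cast<float>(q) - static_cast<float>(zp)) * scale;
}

int main() {
  // One block of 8 weights quantized with scale 0.5 and the default zero point of 8.
  std::vector<uint8_t> packed = {0x10, 0x32, 0x54, 0x76};  // quantized values 0..7
  for (int i = 0; i < 8; ++i) {
    float w = DequantizeInt4(packed[i / 2], (i & 1) != 0, 0.5f, nullptr, /*meta_k*/ 0);
    printf("%d -> %.2f\n", i, w);  // (q - 8) * 0.5
  }
  return 0;
}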
// NOTE: This should be in sync with GetLayoutSensitiveOps in // /onnxruntime/core/optimizer/transpose_optimization/transpose_optimizer.cc + REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, AveragePool, 7); + REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, AveragePool, 10); REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, AveragePool, 11); + REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, AveragePool, 19); REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, BatchNormalization, 9); REGISTER_NHWC_SCHEMA_WITH_ACTIVATION(fn, BatchNormalization, 14); @@ -106,16 +110,18 @@ void OpSet_Internal_NHWC_ONNX::ForEachSchema(const std::functionsecond; + } + } + auto domain_map = allow_official_onnx_release_only_final ? schema_registry->GetLastReleasedOpsetVersions(false) : schema_registry->GetLatestOpsetVersions(false); diff --git a/onnxruntime/core/mlas/inc/mlas_q4.h b/onnxruntime/core/mlas/inc/mlas_q4.h index f3bc2a2434ab3..7c7b729117e4a 100644 --- a/onnxruntime/core/mlas/inc/mlas_q4.h +++ b/onnxruntime/core/mlas/inc/mlas_q4.h @@ -39,7 +39,7 @@ typedef enum { * @brief Computes the number of bytes required to pack and int4-quantize * a weight matrix * @param QType type of block quantization - * @param N the number of columns of matrix B. + * @param N the number of columns of matrix B. * @param K the number of rows of matrix B. * @return size of the packing buffer, 0 if the operation is not yet supported. */ @@ -53,11 +53,11 @@ MlasQ4GemmPackBSize( /** * @brief Prepack and Quantize fp32 weight tensor to int4 blocks - * + * * @param QType type of block quantization * @param PackedBuf destination buffer * @param FpData the pointer to fp32 matrix - * @param N the number of columns of matrix B. + * @param N the number of columns of matrix B. * @param K the number of rows of matrix B. * @param ldb leading dimension of B */ @@ -257,14 +257,14 @@ MlasBlockwiseQuantMetaShape( * matrix shape [rows, columns], compute the shape of the * quantized matrix [q_rows, q_cols]. The quantized matrix * is in column major layout, with bits packed on the column. - * - * @tparam T - * @param block_size - * @param columnwise - * @param rows - * @param columns - * @param q_rows - * @param q_cols + * + * @tparam T + * @param block_size + * @param columnwise + * @param rows + * @param columns + * @param q_rows + * @param q_cols */ template void @@ -283,21 +283,22 @@ MlasBlockwiseQuantizedShape( * parameters (scales, zero points) are packed into separate matrices * all in column major layout for faster access during subsequent matrix * multiplication. 
- * + * * @tparam ElementT type of the input matrix element, usually floating point - * + * @tparam qbits number of bits used for quantization, 4 for int4 + * * @param dst points to the quantized matrix, shape [rows, columns] column major - * @param scales points to the scales matrix, column major + * @param scales points to the scales matrix, column major * @param zero_points points to the zero_points matrix, column major * @param src points to the floating point matrix, to be quantized, row major shape [rows, columns] * @param block_size size of the block to quantize, elements from the same block share the same scale and zero point * @param columnwise true when elements in a block are from the same column, false when elements in a block are from the same row - * @param rows - * @param columns - * @param leading_dimension - * @param thread_pool + * @param rows + * @param columns + * @param leading_dimension + * @param thread_pool */ -template +template void MlasQuantizeBlockwise( uint8_t* dst, @@ -318,19 +319,21 @@ MlasQuantizeBlockwise( * parameters (scales, zero points) are from separate matrices packed * in column major layout. Output is a floating point matrix in column * major layout for faster access during subsequent matrix multiplication. - * + * * @tparam ElementT type of the dequantized matrix element, usually floating point + * @tparam qbits number of bits used for quantization, 4 for int4 + * * @param dst points to dequantized matrix shape [rows, columns] column major * @param src points to quantized matrix, column major * @param scales points to quantization scales, column major * @param zero_points points to quantization zero points, column major * @param block_size size of the block to quantize, elements from the same block share the same scale and zero point * @param columnwise true when elements in a block are from the same column, false when elements in a block are from the same row - * @param rows - * @param columns - * @param thread_pool + * @param rows + * @param columns + * @param thread_pool */ -template +template void MlasDequantizeBlockwise( ElementT* dst, diff --git a/onnxruntime/core/mlas/lib/q4_dq.cpp b/onnxruntime/core/mlas/lib/q4_dq.cpp index 24a2212ba0714..fbd1030de8ab7 100644 --- a/onnxruntime/core/mlas/lib/q4_dq.cpp +++ b/onnxruntime/core/mlas/lib/q4_dq.cpp @@ -364,7 +364,7 @@ range2scalezp(float min, float max, ScaleT& scale, uint8_t& zp) } else { zp = (uint8_t)roundf(zero_point_fp); } - scale = static_cast(scale_f); + scale = ScaleT(scale_f); } template @@ -377,7 +377,7 @@ range2scale(float min, float max, ScaleT& scale) max = fabsf(max) > fabsf(min) ? 
max : min; - scale = static_cast(max / mid_fp); + scale = ScaleT(max / mid_fp); }; @@ -773,7 +773,7 @@ MlasBlockwiseQuantizedShape( ); -template +template void MlasQuantizeBlockwise( uint8_t* dst, @@ -791,50 +791,50 @@ MlasQuantizeBlockwise( switch (block_size) { case 16: if (columnwise) { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } else { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } break; case 32: if (columnwise) { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } else { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } break; case 64: if (columnwise) { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } else { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } break; case 128: if (columnwise) { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } else { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } break; case 256: if (columnwise) { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } else { - BlockwiseQuantizer::quantizeAndTranspose( + BlockwiseQuantizer::quantizeAndTranspose( dst, scales, zero_points, src, rows, columns, leading_dimension, thread_pool); } break; @@ -847,7 +847,7 @@ MlasQuantizeBlockwise( template void -MlasQuantizeBlockwise( +MlasQuantizeBlockwise( uint8_t* dst, float* scales, uint8_t* zero_points, @@ -860,8 +860,23 @@ MlasQuantizeBlockwise( MLAS_THREADPOOL* thread_pool ); +template +void +MlasQuantizeBlockwise( + uint8_t* dst, + MLAS_FP16* scales, + uint8_t* zero_points, + const MLAS_FP16* src, + int block_size, + bool columnwise, + int rows, + int columns, + int leading_dimension, + MLAS_THREADPOOL* thread_pool + ); + -template +template void MlasDequantizeBlockwise( T* dst, @@ -878,46 +893,46 @@ MlasDequantizeBlockwise( switch (block_size) { case 16: if (columnwise) { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } else { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } break; case 32: if (columnwise) { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } else { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } break; case 64: if (columnwise) { - 
BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } else { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } break; case 128: if (columnwise) { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } else { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } break; case 256: if (columnwise) { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } else { - BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, + BlockwiseQuantizer::dequantize(dst, src, scales, zero_points, rows, columns, thread_pool); } break; @@ -929,7 +944,7 @@ MlasDequantizeBlockwise( template void -MlasDequantizeBlockwise( +MlasDequantizeBlockwise( float* dst, const uint8_t* src, const float* scales, diff --git a/onnxruntime/core/optimizer/graph_transformer_utils.cc b/onnxruntime/core/optimizer/graph_transformer_utils.cc index 86b126f2c7c31..c1397e92d9d26 100644 --- a/onnxruntime/core/optimizer/graph_transformer_utils.cc +++ b/onnxruntime/core/optimizer/graph_transformer_utils.cc @@ -51,6 +51,7 @@ #include "core/optimizer/matmul_scale_fusion.h" #include "core/optimizer/matmul_transpose_fusion.h" #include "core/optimizer/matmul_bn_fusion.h" +#include "core/optimizer/pad_fusion.h" #include "core/optimizer/nchwc_transformer.h" #include "core/optimizer/noop_elimination.h" #include "core/optimizer/not_where_fusion.h" @@ -128,6 +129,7 @@ InlinedVector> GenerateRewriteRules( rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); + rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); rules.push_back(std::make_unique()); diff --git a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc index 290380cabb036..4505d4afdf1e0 100644 --- a/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc +++ b/onnxruntime/core/optimizer/layout_transformation/layout_transformation.cc @@ -66,17 +66,6 @@ bool ConvertNodeLayout(const api::NodeRef& node) { const auto& layout_sensitive_ops = GetORTLayoutSensitiveOps(); // handle special cases -#if defined(USE_XNNPACK) - if (node.GetExecutionProviderType() == kXnnpackExecutionProvider) { - if (node.OpType() == "Resize") { - // XNNPACK supports NCHW and NHWC for Resize so we don't need to use the internal NHWC domain and wrap the Resize - // with Transpose nodes. EPAwareHandleResize will allow an NCHW <-> NHWC Transpose to be pushed through - // the Resize during transpose optimization. 
-      return false;
-    }
-  }
-#endif
-
 #if defined(USE_JSEP)
   // TODO(fs-eire): Remove special case handing of JSEP once NHWC Resize implementation is fixed
   if (node.GetExecutionProviderType() == kJsExecutionProvider) {
diff --git a/onnxruntime/core/optimizer/pad_fusion.cc b/onnxruntime/core/optimizer/pad_fusion.cc
new file mode 100644
index 0000000000000..b25e7618802dd
--- /dev/null
+++ b/onnxruntime/core/optimizer/pad_fusion.cc
@@ -0,0 +1,128 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/optimizer/pad_fusion.h"
+#include "core/graph/graph_utils.h"
+#include "core/optimizer/initializer.h"
+#include "core/optimizer/utils.h"
+
+namespace onnxruntime {
+
+/*
+ * This rule matches the following pattern:
+ *        Pad
+ *         |
+ *   Conv/MaxPool
+ */
+bool PadFusion::SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger&) const {
+  // If the Pad node has the optional 'axes' input, don't fuse it.
+  if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "Pad", {1, 2, 11, 13, 18, 19}) ||
+      node.GetOutputEdgesCount() != 1 ||
+      node.InputDefs().size() > 3) {
+    return false;
+  }
+
+  if (graph.NodeProducesGraphOutput(node)) {
+    return false;
+  }
+
+  const Node& child_node = *node.OutputNodesBegin();
+  if (!graph_utils::IsSupportedOptypeVersionAndDomain(child_node, "Conv", {1, 11}) &&
+      !graph_utils::IsSupportedOptypeVersionAndDomain(child_node, "MaxPool", {1, 8, 10, 11, 12})) {
+    return false;
+  }
+
+  // Don't fuse if MaxPool has the optional output indices tensor, because the output indices
+  // do not incorporate the pad values. If we allowed the fusion, the dimensions of the input
+  // tensor seen by MaxPool would be smaller than without the fusion, so the range of values
+  // in the output indices tensor would be smaller than it should be.
+  if (child_node.OutputDefs().size() > 1) {
+    return false;
+  }
+
+  // The Conv or MaxPool node must use explicit padding (auto_pad == "NOTSET") to perform this fusion.
+  if (child_node.GetAttributes().find("auto_pad") != child_node.GetAttributes().end() &&
+      child_node.GetAttributes().at("auto_pad").s() != "NOTSET") {
+    return false;
+  }
+
+  const NodeAttributes& pad_attributes = node.GetAttributes();
+  if (pad_attributes.find("mode") != pad_attributes.end() &&
+      pad_attributes.at("mode").s() != "constant") {
+    return false;
+  }
+
+  // Since opset 11, pads and constant_value are provided as inputs instead of attributes.
+  // Both of these must be initializers because we have to verify their values.
+  if (node.SinceVersion() >= 11) {
+    if (!graph_utils::NodeArgIsConstant(graph, *node.InputDefs()[1]) ||
+        (node.InputDefs().size() > 2 && !graph_utils::NodeArgIsConstant(graph, *node.InputDefs()[2]))) {
+      return false;
+    }
+
+    // constant_value must be zero because Conv and MaxPool only support 0 as the padding value.
+    if (node.InputDefs().size() > 2) {
+      const auto* pad_constant_value_proto = graph_utils::GetConstantInitializer(graph, node.InputDefs()[2]->Name());
+      Initializer pad_constant_value{*pad_constant_value_proto, graph.ModelPath()};
+      if (std::any_of(pad_constant_value.DataAsByteSpan().begin(), pad_constant_value.DataAsByteSpan().end(), [](const uint8_t byte) { return byte != 0; })) {
+        return false;
+      }
+    }
+  } else {
+    if (pad_attributes.find("value") != pad_attributes.end() &&
+        pad_attributes.at("value").f() != 0.0) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+/*
+ * - The pads values for the first two dimensions (batch and channel) must be zero, and the
+ *   remaining pads values must be >= 0.
+ */
+Status PadFusion::Apply(Graph& graph, Node& pad_node, RewriteRuleEffect& rule_effect, const logging::Logger&) const {
+  std::vector<int64_t> pads_values;
+
+  if (pad_node.SinceVersion() >= 11) {
+    const auto* pads_proto = graph_utils::GetConstantInitializer(graph, pad_node.InputDefs()[1]->Name());
+    Initializer pads{*pads_proto, graph.ModelPath()};
+    pads_values.assign(pads.DataAsSpan<int64_t>().begin(), pads.DataAsSpan<int64_t>().end());
+  } else {
+    pads_values.assign(pad_node.GetAttributes().at("pads").ints().begin(), pad_node.GetAttributes().at("pads").ints().end());
+  }
+
+  assert(static_cast<uint32_t>(pads_values.size()) == (2 * static_cast<uint32_t>(pad_node.InputDefs()[0]->Shape()->dim_size())));
+
+  uint32_t pads_size = static_cast<uint32_t>(pads_values.size());
+  // Only fuse if padding is applied exclusively to the spatial dims (not the batch or channel dims).
+  if (pads_values[0] != 0 || pads_values[1] != 0 || pads_values[pads_size / 2] != 0 ||
+      pads_values[pads_size / 2 + 1] != 0) {
+    return Status::OK();
+  }
+
+  // Only fuse if all padding values are non-negative.
+  if (std::any_of(pads_values.begin(), pads_values.end(), [](int64_t value) { return value < 0; })) {
+    return Status::OK();
+  }
+
+  Node& child_node = *graph.GetNode(pad_node.OutputNodesBegin()->Index());
+  auto child_pads = child_node.GetMutableAttributes()["pads"].mutable_ints();
+  uint32_t child_pads_size = static_cast<uint32_t>(child_pads->size());
+
+  for (uint32_t pads_index = 2, child_index = 0; pads_index < pads_size / 2; pads_index++, child_index++) {
+    child_pads->Set(child_index, child_pads->Get(child_index) + pads_values[pads_index]);
+    uint32_t mirrored_child_index = child_index + (child_pads_size / 2);
+    uint32_t mirrored_pad_index = pads_index + (pads_size / 2);
+    child_pads->Set(mirrored_child_index, child_pads->Get(mirrored_child_index) + pads_values[mirrored_pad_index]);
+  }
+
+  graph_utils::RemoveNodeOutputEdges(graph, pad_node);
+  graph_utils::ReplaceNodeInput(child_node, 0, *pad_node.MutableInputDefs()[0]);
+  graph.RemoveNode(pad_node.Index());
+  rule_effect = RewriteRuleEffect::kRemovedCurrentNode;
+  return Status::OK();
+}
+}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/optimizer/pad_fusion.h b/onnxruntime/core/optimizer/pad_fusion.h
new file mode 100644
index 0000000000000..a1b6978a83d1e
--- /dev/null
+++ b/onnxruntime/core/optimizer/pad_fusion.h
@@ -0,0 +1,27 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/optimizer/rewrite_rule.h"
+
+namespace onnxruntime {
+/*
+ * This fusion merges a Pad operator into its child
+ * Conv or MaxPool operator, if and only if PadFusion::SatisfyCondition()
+ * is true.
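+ *
+ * For example (illustrative values, not taken from a real model): with an NCHW input, a
+ * constant-mode Pad with pads = {0, 0, 1, 2, 0, 0, 3, 4} followed by a Conv with
+ * pads = {1, 1, 1, 1} is rewritten to a single Conv with pads = {2, 3, 4, 5}: the Pad's
+ * spatial begin/end amounts are added to the Conv's begin/end pads, and the zero
+ * batch/channel pads are dropped.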
+ */ +class PadFusion : public RewriteRule { + public: + PadFusion() : RewriteRule("Pad_Fusion") {} + + std::vector TargetOpTypes() const noexcept override { + return {"Pad"}; + } + + private: + bool SatisfyCondition(const Graph& graph, const Node& node, const logging::Logger& logger) const override; + + Status Apply(Graph& graph, Node& matmul_node, RewriteRuleEffect& rule_effect, const logging::Logger& logger) const override; +}; +} // namespace onnxruntime \ No newline at end of file diff --git a/onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc b/onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc index f4f3505128737..8eaac3d34c3af 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc @@ -17,7 +17,7 @@ static bool EPAwareHandleResize(HandlerArgs& args) { // layout. Due to that, only push a Transpose through a Resize once it is assigned and we know it's being handled // by an EP that supports multiple layouts. Currently that's the CPU and XNNPACK EPs. const auto ep_type = args.node.GetExecutionProviderType(); - if (ep_type == kCpuExecutionProvider || ep_type == kXnnpackExecutionProvider) { + if (ep_type == kCpuExecutionProvider) { // allow NCHW <-> NHWC for now. not clear any other sort of transpose has a valid usage in a real model int64_t rank_int = gsl::narrow_cast(args.perm.size()); if (rank_int == 4) { diff --git a/onnxruntime/core/providers/cpu/generator/random.cc b/onnxruntime/core/providers/cpu/generator/random.cc index b63c0d2161ad5..dfa27f1f44d5a 100644 --- a/onnxruntime/core/providers/cpu/generator/random.cc +++ b/onnxruntime/core/providers/cpu/generator/random.cc @@ -428,4 +428,14 @@ template Status MultinomialComputeShared(AllocatorPtr& alloc, std::default_random_engine& generator, Tensor& Y); +#if !defined(DISABLE_CONTRIB_OPS) +// used by onnxruntime/contrib_ops/cpu/transformers/sampling_cpu_helper.h +template Status MultinomialComputeShared(AllocatorPtr& alloc, + const Tensor& X, + const int64_t batch_size, + const int64_t num_classes, + const int64_t num_samples, + std::default_random_engine& generator, + Tensor& Y); +#endif } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc index 3192c8573c5c0..1d524a90302e7 100644 --- a/onnxruntime/core/providers/cpu/math/element_wise_ops.cc +++ b/onnxruntime/core/providers/cpu/math/element_wise_ops.cc @@ -967,7 +967,7 @@ Status Xor::Compute(OpKernelContext* context) const { }, [](BroadcastHelper& per_iter_bh) { per_iter_bh.OutputEigen() = - per_iter_bh.EigenInput0().array() ^ per_iter_bh.EigenInput1().array(); + per_iter_bh.EigenInput0().array() != per_iter_bh.EigenInput1().array(); }}; UntypedBroadcastTwo(*context, funcs, 1.0); diff --git a/onnxruntime/core/providers/cpu/tensor/upsamplebase.h b/onnxruntime/core/providers/cpu/tensor/upsamplebase.h index 99522cdf0759a..0b3ce6f477843 100644 --- a/onnxruntime/core/providers/cpu/tensor/upsamplebase.h +++ b/onnxruntime/core/providers/cpu/tensor/upsamplebase.h @@ -352,7 +352,7 @@ class UpsampleBase { (scales.size() == 4 && scales[0] == 1 && scales[3] == 1) || scales.size() == 3 || (scales.size() == 5 && scales[0] == 1 && scales[1] == 1), - "'Linear' mode only support:\n" + "'Linear' mode only supports:\n" " * 2-D inputs or\n" " * 3-D inputs ('Bilinear', 'Trilinear') or\n" " * 4-D inputs with the corresponding 
outermost 2 scale values being 1" diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp index 1db22ac92e527..5c7b7bff1e370 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/DmlRuntimeFusedGraphKernel.cpp @@ -93,7 +93,7 @@ namespace Dml onnxruntime::Status Compute(onnxruntime::OpKernelContext* kernelContext) const override { - ORT_THROW_HR_IF(E_UNEXPECTED, m_subgraphInputs.size() != kernelContext->InputCount()); + ORT_THROW_HR_IF(E_UNEXPECTED, static_cast(m_subgraphInputs.size()) != kernelContext->InputCount()); bool recompileNeeded = m_compiledExecutionPlanOperator == nullptr; @@ -159,7 +159,7 @@ namespace Dml if (iter != m_inferredInputShapes.end()) { auto tensorShape = *nodeArg->Shape(); - ORT_THROW_HR_IF(E_UNEXPECTED, tensorShape.dim_size() != iter->second.NumDimensions()); + ORT_THROW_HR_IF(E_UNEXPECTED, tensorShape.dim_size() != static_cast(iter->second.NumDimensions())); for (int i = 0; i < tensorShape.dim_size(); ++i) { diff --git a/onnxruntime/core/providers/js/js_execution_provider.cc b/onnxruntime/core/providers/js/js_execution_provider.cc index 8dbd552dd0550..798244d7cb75b 100644 --- a/onnxruntime/core/providers/js/js_execution_provider.cc +++ b/onnxruntime/core/providers/js/js_execution_provider.cc @@ -238,18 +238,40 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 16, Whe class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, Transpose); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Transpose); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, Conv); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, Conv); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, 10, Conv); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, Conv); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ConvTranspose); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, ConvTranspose); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, 10, ConvTranspose); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, ConvTranspose); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 7, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 9, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, MaxPool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, 7, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 8, 9, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); class 
ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, AveragePool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, AveragePool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, AveragePool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 7, 9, AveragePool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 10, 10, AveragePool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, AveragePool); + +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalAveragePool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, GlobalAveragePool); + +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalMaxPool); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 1, GlobalMaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, Conv); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, Conv); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, ConvTranspose); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, ConvTranspose); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 8, Gemm); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 9, 10, Gemm); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Gemm); @@ -257,17 +279,6 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Gem class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 12, MatMul); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, MatMul); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 7, 9, AveragePool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, AveragePool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, AveragePool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalAveragePool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 7, MaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 9, MaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, MaxPool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 11, MaxPool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 12, MaxPool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, GlobalMaxPool); - class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, float, ArgMax); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, float, ArgMax); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, float, ArgMax); @@ -291,11 +302,17 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomai class 
ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, Split); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 8, 12, Expand); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Expand); + class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, Resize); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Resize); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, 17, Resize); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 18, 18, Resize); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 19, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 10, 10, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 12, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 13, 17, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 18, 18, Resize); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 19, Resize); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 10, Gather); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Gather); @@ -304,11 +321,6 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Gat class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, GatherElements); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, GatherElements); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 11, 12, Resize); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 13, 17, Resize); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 18, 18, Resize); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 19, Resize); - class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, 9, Slice); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 10, 10, Slice); class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, 12, Slice); @@ -322,8 +334,9 @@ class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomai class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 13, Tile); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 17, LayerNormalization); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 6, InstanceNormalization); + class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 6, InstanceNormalization); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSInternalNHWCDomain, 6, InstanceNormalization); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 11, Range); @@ -508,18 +521,40 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, + 
BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, BuildKernelCreateInfo, + + BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -527,17 +562,6 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, - BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -575,7 +599,7 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, - + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, @@ -594,8 +618,9 @@ std::unique_ptr RegisterKernels() { BuildKernelCreateInfo, BuildKernelCreateInfo, - BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/core/providers/js/operators/conv.cc b/onnxruntime/core/providers/js/operators/conv.cc index 68336c996a863..474fd260880ce 100644 --- a/onnxruntime/core/providers/js/operators/conv.cc +++ b/onnxruntime/core/providers/js/operators/conv.cc @@ -25,6 +25,13 @@ ONNX_OPERATOR_KERNEL_EX( (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), Conv); +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + Conv, + kMSInternalNHWCDomain, + 1, 10, + kJsExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), + Conv); ONNX_OPERATOR_VERSIONED_KERNEL_EX( Conv, kOnnxDomain, diff --git a/onnxruntime/core/providers/js/operators/conv_transpose.cc b/onnxruntime/core/providers/js/operators/conv_transpose.cc index f7f0ab22b7006..2aaf438f30d4d 100644 --- a/onnxruntime/core/providers/js/operators/conv_transpose.cc +++ b/onnxruntime/core/providers/js/operators/conv_transpose.cc @@ -24,6 +24,13 @@ ONNX_OPERATOR_KERNEL_EX( (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), ConvTranspose); +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + ConvTranspose, + kMSInternalNHWCDomain, + 1, 10, + kJsExecutionProvider, + (*KernelDefBuilder::Create()).TypeConstraint("T", JsepSupportedFloatTypes()), + ConvTranspose); ONNX_OPERATOR_VERSIONED_KERNEL_EX( ConvTranspose, kOnnxDomain, diff --git a/onnxruntime/core/providers/js/operators/pool.cc b/onnxruntime/core/providers/js/operators/pool.cc index 7fdb4e5d114ea..7df1e483f52a1 100644 --- a/onnxruntime/core/providers/js/operators/pool.cc +++ b/onnxruntime/core/providers/js/operators/pool.cc @@ -52,15 +52,20 @@ namespace js { Pool); POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, AveragePool, 7, 9) +POOLING_KERNEL_VERSIONED(AveragePool, kMSInternalNHWCDomain, true, AveragePool, 7, 9) POOLING_KERNEL_VERSIONED(AveragePool, kOnnxDomain, false, AveragePool, 10, 10) +POOLING_KERNEL_VERSIONED(AveragePool, kMSInternalNHWCDomain, true, AveragePool, 10, 10) POOLING_KERNEL(AveragePool, kOnnxDomain, false, AveragePool, 11) 
POOLING_KERNEL(AveragePool, kMSInternalNHWCDomain, true, AveragePool, 11) POOLING_KERNEL(GlobalAveragePool, kOnnxDomain, false, AveragePool, 1) POOLING_KERNEL(GlobalAveragePool, kMSInternalNHWCDomain, true, AveragePool, 1) POOLING_KERNEL_VERSIONED(MaxPool, kOnnxDomain, false, MaxPool<1>, 1, 7) +POOLING_KERNEL_VERSIONED(MaxPool, kMSInternalNHWCDomain, true, MaxPool<1>, 1, 7) POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 8, 9) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 8, 9) POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 10, 10) +POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 10, 10) POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 11, 11) POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, kMSInternalNHWCDomain, true, MaxPool<8>, 11, 11) POOLING_KERNEL_WITH_INDICES(MaxPool, kOnnxDomain, false, MaxPool<8>, 12) diff --git a/onnxruntime/core/providers/js/operators/resize.cc b/onnxruntime/core/providers/js/operators/resize.cc index 5b2e385777a37..2514ab75dff26 100644 --- a/onnxruntime/core/providers/js/operators/resize.cc +++ b/onnxruntime/core/providers/js/operators/resize.cc @@ -51,6 +51,7 @@ namespace js { REGISTER_RESIZE_KERNEL(domain, 19); REGISTER_RESIZE_VERSIONED_10_10_KERNEL(kOnnxDomain); +REGISTER_RESIZE_VERSIONED_10_10_KERNEL(kMSInternalNHWCDomain); REGISTER_RESIZE_KERNEL_DOMAIN(kOnnxDomain); REGISTER_RESIZE_KERNEL_DOMAIN(kMSInternalNHWCDomain); diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc index 2dfdfffe5fa54..fc8c5c357682c 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/pad_op_builder.cc @@ -202,16 +202,8 @@ Status PadOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrap // Qnn format is begin_0, end_0, begin_1, end_1, ... 
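The comment above states the target layout; a minimal standalone sketch of the interleaving that ReArranagePads (called just below) presumably performs, assuming the ONNX ordering [begin_0, ..., begin_{n-1}, end_0, ..., end_{n-1}] and an unsigned element type:

#include <cstdint>
#include <vector>

// Reorder ONNX pad amounts [b0, ..., b(n-1), e0, ..., e(n-1)] into the interleaved
// QNN layout [b0, e0, b1, e1, ...]. The real helper may do this in place.
std::vector<uint32_t> InterleavePads(const std::vector<uint32_t>& onnx_pads) {
  const size_t rank = onnx_pads.size() / 2;
  std::vector<uint32_t> qnn_pads;
  qnn_pads.reserve(onnx_pads.size());
  for (size_t i = 0; i < rank; ++i) {
    qnn_pads.push_back(onnx_pads[i]);         // begin_i
    qnn_pads.push_back(onnx_pads[rank + i]);  // end_i
  }
  return qnn_pads;
}

// e.g. {1, 2, 3, 4} (begins 1,2 / ends 3,4) -> {1, 3, 2, 4}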
ReArranagePads(pad_amount); - std::vector pad_amount_dim{static_cast(pad_amount.size() / 2), static_cast(2)}; - QnnParamWrapper multiples_param(node_unit.Index(), node_unit.Name(), QNN_OP_PAD_PARAM_PAD_AMOUNT, std::move(pad_amount_dim), - std::move(pad_amount)); - param_tensor_names.push_back(multiples_param.GetParamTensorName()); - qnn_model_wrapper.AddParamWrapper(std::move(multiples_param)); - - // Process optional input constant_value - if (node_unit.Inputs().size() > 2) { - ORT_RETURN_IF_ERROR(ProcessConstantValue(qnn_model_wrapper, param_tensor_names, node_unit, inputs[2])); - } // constant_value + std::vector input_shape; + ORT_RETURN_IF_NOT(qnn_model_wrapper.GetOnnxShape(inputs[0].node_arg, input_shape), "Cannot get shape of input 0."); NodeAttrHelper node_helper(node_unit); std::string mode = node_helper.Get("mode", "constant"); @@ -220,6 +212,10 @@ Status PadOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrap if ("constant" == mode) { mode_qnn_scalar.uint32Value = QNN_OP_PAD_SCHEME_CONSTANT; } else if ("reflect" == mode) { + for (size_t i = 0; i < input_shape.size(); i++) { + ORT_RETURN_IF(pad_amount[i * 2] > input_shape[i] - 1 || pad_amount[(i * 2) + 1] > input_shape[i] - 1, + "Pad amount should not be greater than shape(input[0])[i] - 1"); + } mode_qnn_scalar.uint32Value = QNN_OP_PAD_SCHEME_MIRROR_REFLECT; } else if ("edge" == mode) { mode_qnn_scalar.uint32Value = QNN_OP_PAD_SCHEME_EDGE; @@ -227,10 +223,21 @@ Status PadOpBuilder::ProcessAttributesAndOutputs(QnnModelWrapper& qnn_model_wrap return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Pad mode only support constant."); } + std::vector pad_amount_dim{static_cast(pad_amount.size() / 2), static_cast(2)}; QnnParamWrapper mode_param(node_unit.Index(), node_unit.Name(), QNN_OP_PAD_PARAM_SCHEME, mode_qnn_scalar); param_tensor_names.push_back(mode_param.GetParamTensorName()); qnn_model_wrapper.AddParamWrapper(std::move(mode_param)); + QnnParamWrapper multiples_param(node_unit.Index(), node_unit.Name(), QNN_OP_PAD_PARAM_PAD_AMOUNT, + std::move(pad_amount_dim), std::move(pad_amount)); + param_tensor_names.push_back(multiples_param.GetParamTensorName()); + qnn_model_wrapper.AddParamWrapper(std::move(multiples_param)); + + // Process optional input constant_value + if (node_unit.Inputs().size() > 2) { + ORT_RETURN_IF_ERROR(ProcessConstantValue(qnn_model_wrapper, param_tensor_names, node_unit, inputs[2])); + } // constant_value + ORT_RETURN_IF_ERROR(ProcessOutputs(qnn_model_wrapper, node_unit, std::move(input_names), std::move(param_tensor_names), diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index a1fc67ff60b6f..7f5ab3a772305 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -365,6 +365,46 @@ std::unique_lock TensorrtExecutionProvider::GetApiLock() const { return std::unique_lock(singleton); } +Status GetShapeOfShapeTensor(Ort::ConstValue& input_tensor, + std::vector& shape_values, + nvinfer1::ICudaEngine* trt_engine, + int binding_index, + cudaStream_t stream) { + auto tensor_info = input_tensor.GetTensorTypeAndShapeInfo(); + const auto tensor_shapes = tensor_info.GetShape(); + const auto tensor_type = tensor_info.GetElementType(); + nvinfer1::Dims dims = trt_engine->getBindingDimensions(static_cast(binding_index)); + int nb_dims = dims.nbDims; + int shape_size = nb_dims == 0 ? 
1 : static_cast(tensor_shapes[0]); // The shape of the "shape tensor" is either zero dimension (scalar) or 1-dimension + shape_values.resize(shape_size, 1); + + switch (tensor_type) { + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: { + auto input = std::make_unique(shape_size); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input.get(), input_tensor.GetTensorData(), shape_size * sizeof(int32_t), cudaMemcpyDeviceToHost, stream)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); + for (int j = 0; j < shape_size; ++j) { + shape_values[j] = input[j]; + } + break; + } + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: { + auto input = std::make_unique(shape_size); + CUDA_RETURN_IF_ERROR(cudaMemcpyAsync(input.get(), input_tensor.GetTensorData(), shape_size * sizeof(int64_t), cudaMemcpyDeviceToHost, stream)); + CUDA_RETURN_IF_ERROR(cudaStreamSynchronize(stream)); + for (int j = 0; j < shape_size; ++j) { + shape_values[j] = static_cast(input[j]); + } + break; + } + default: { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, + "TensorRT shape tensor data type: " + std::to_string(tensor_type) + " not supported."); + } + } + return Status::OK(); +} + /* * Apply TensorRT optimization profile shapes from provider options. * @@ -404,7 +444,7 @@ bool ApplyProfileShapesFromProviderOptions(std::vectorisShapeTensor()) { - auto shape_size = nb_dims; + int shape_size = nb_dims == 0 ? 1 : static_cast(profile_min_shapes[input_name][i].size()); std::vector shapes_min(shape_size), shapes_opt(shape_size), shapes_max(shape_size); LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] shape size of this shape tensor is " << shape_size; @@ -2758,7 +2798,17 @@ common::Status TensorrtExecutionProvider::Compile(const std::vectorisShapeBinding(binding_index)) { - trt_context->setInputShapeBinding(binding_index, &tensor_shape_values[input_name][0]); + // Get shape of the shape tensor + std::vector shape_values; + if (!tensor_shape_values[input_name].empty()) { + shape_values = tensor_shape_values[input_name]; + } else { + auto status = GetShapeOfShapeTensor(input_tensor, shape_values, trt_engine, binding_index, stream); + if (status != Status::OK()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, status.ErrorMessage()); + } + } + trt_context->setInputShapeBinding(binding_index, &shape_values[0]); } else { for (int j = 0, end = nb_dims; j < end; ++j) { dimensions.d[j] = static_cast(tensor_shapes[j]); diff --git a/onnxruntime/core/providers/xnnpack/detail/node_support_checker.cc b/onnxruntime/core/providers/xnnpack/detail/node_support_checker.cc index ea5c75a955cc4..8e7e228f974e6 100644 --- a/onnxruntime/core/providers/xnnpack/detail/node_support_checker.cc +++ b/onnxruntime/core/providers/xnnpack/detail/node_support_checker.cc @@ -7,22 +7,22 @@ #include "core/common/common.h" #include "core/framework/op_node_proto_helper.h" -#include "core/graph/graph_viewer.h" #include "core/graph/graph_utils.h" +#include "core/graph/graph_viewer.h" #include "core/providers/common.h" #include "core/providers/cpu/nn/pool_attributes.h" -#include "core/providers/xnnpack/detail/utils.h" #include "core/providers/shared/node_unit/node_unit.h" +#include "core/providers/xnnpack/detail/utils.h" // each operator provides a helper to check if supported -#include "core/providers/xnnpack/nn/conv.h" -#include "core/providers/xnnpack/nn/conv_transpose.h" -#include "core/providers/xnnpack/nn/max_pool.h" #include "core/providers/xnnpack/math/gemm.h" #include "core/providers/xnnpack/math/matmul.h" +#include "core/providers/xnnpack/math/softmax.h" #include 
"core/providers/xnnpack/nn/average_pool.h" -#include "core/providers/xnnpack/nn/resize.h" -#include "core/providers/xnnpack/nn/softmax.h" +#include "core/providers/xnnpack/nn/conv.h" +#include "core/providers/xnnpack/nn/conv_transpose.h" +#include "core/providers/xnnpack/nn/max_pool.h" +#include "core/providers/xnnpack/tensor/resize.h" namespace onnxruntime { namespace xnnpack { diff --git a/onnxruntime/core/providers/xnnpack/detail/utils.cc b/onnxruntime/core/providers/xnnpack/detail/utils.cc index baca4eef537d7..1a32612981120 100644 --- a/onnxruntime/core/providers/xnnpack/detail/utils.cc +++ b/onnxruntime/core/providers/xnnpack/detail/utils.cc @@ -25,7 +25,7 @@ const char* OpTypeToString(OpComputeType opCtype) { case op_compute_type_fp16: return "fp16"; case op_compute_type_qs8_per_channel: - return "qc8"; + return "qs8_qc8w"; case op_compute_type_qs8: return "qs8"; case op_compute_type_qu8: diff --git a/onnxruntime/core/providers/xnnpack/math/gemm.cc b/onnxruntime/core/providers/xnnpack/math/gemm.cc index 24c233e2415ca..f7b736b0ff903 100644 --- a/onnxruntime/core/providers/xnnpack/math/gemm.cc +++ b/onnxruntime/core/providers/xnnpack/math/gemm.cc @@ -78,7 +78,7 @@ bool Gemm::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& gra return supported; } -Gemm::Gemm(const OpKernelInfo& info) : GemmBase(info), XnnpackKernel(info) { +Gemm::Gemm(const OpKernelInfo& info) : GemmBase(info), XnnpackKernel(info, /*enable_caches*/ true) { const auto& node{Node()}; info.GetAttrOrDefault("alpha", &alpha_, 1.f); @@ -146,14 +146,9 @@ Status Gemm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr, trans_B_ == CblasNoTrans ? B_->Shape()[1] : B_->Shape()[0], // size_t output_stride, B_->Data(), // const float* kernel, bias_Data, // const float* bias, - output_min, - output_max, + output_min, output_max, flags, -#ifdef XNN_CACHE_ENABLE - &xnn_caches_, -#else - 0, -#endif + GetCodeCache(), GetWeightsCache(), &p); if (status != xnn_status_success) { @@ -165,20 +160,25 @@ Status Gemm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr, } Status Gemm::Compute(OpKernelContext* context) const { - pthreadpool_t t_pool = GetThreadPool(); + pthreadpool_t threadpool = GetThreadPool(); const auto* A = context->Input(0); auto Y = context->Output(0, {M_, N_}); // if input is empty tensor, return as nothing need to be calculated and we've set the shape for the output - if (M_ == 0 || N_ == 0) + if (M_ == 0 || N_ == 0) { return Status::OK(); + } + + xnn_status status = xnn_reshape_fully_connected_nc_f32(op0_.get(), + // Number of rows to multiply + trans_A_ == CblasNoTrans ? M_ : K_, + threadpool); + + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_fully_connected_nc_f32 returned ", status); + } - xnn_status status = xnn_setup_fully_connected_nc_f32( - op0_.get(), - trans_A_ == CblasNoTrans ? 
M_ : K_, // Number of rows to multiply - A->Data(), - Y->MutableData(), - t_pool); + status = xnn_setup_fully_connected_nc_f32(op0_.get(), A->Data(), Y->MutableData()); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_fully_connected_nc_f32 returned ", status); @@ -192,7 +192,15 @@ Status Gemm::Compute(OpKernelContext* context) const { return Status::OK(); } -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Gemm, kOnnxDomain, 7, 12, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Gemm, kOnnxDomain, 7, 8, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Gemm); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Gemm, kOnnxDomain, 9, 10, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Gemm); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Gemm, kOnnxDomain, 11, 12, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Gemm); diff --git a/onnxruntime/core/providers/xnnpack/math/gemm.h b/onnxruntime/core/providers/xnnpack/math/gemm.h index 9191ba204bc25..6d11a8531c20f 100644 --- a/onnxruntime/core/providers/xnnpack/math/gemm.h +++ b/onnxruntime/core/providers/xnnpack/math/gemm.h @@ -41,14 +41,6 @@ class Gemm : protected GemmBase, public XnnpackKernel { float alpha_; float beta_; - -#ifdef XNN_CACHE_ENABLE -#if XNN_PLATFORM_JIT - xnn_code_cache code_cache_; -#endif - xnn_caches xnn_caches_ = {0, 0}; - xnn_weights_cache weights_cache_; -#endif }; } // namespace xnnpack diff --git a/onnxruntime/core/providers/xnnpack/math/matmul.cc b/onnxruntime/core/providers/xnnpack/math/matmul.cc index fc7335c79b603..e90aa11c9d087 100644 --- a/onnxruntime/core/providers/xnnpack/math/matmul.cc +++ b/onnxruntime/core/providers/xnnpack/math/matmul.cc @@ -62,7 +62,7 @@ bool MatMul::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& g return supported; } -MatMul::MatMul(const OpKernelInfo& info) : XnnpackKernel(info) {} +MatMul::MatMul(const OpKernelInfo& info) : XnnpackKernel(info, /*enable_caches*/ true) {} Status MatMul::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, /*out*/ bool& is_packed, @@ -99,9 +99,11 @@ Status MatMul::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, output_max, flags, #ifdef XNN_CACHE_ENABLE - &xnn_caches_, + GetCodeCache(), + GetWeightsCache(), #else - 0, + nullptr, + nullptr, #endif &p); @@ -116,7 +118,7 @@ Status MatMul::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, Status MatMul::Compute(OpKernelContext* ctx) const { const Tensor* a = ctx->Input(0); - pthreadpool_t t_pool = GetThreadPool(); + pthreadpool_t threadpool = GetThreadPool(); MatMulComputeHelper helper; ORT_RETURN_IF_ERROR(helper.Compute(a->Shape(), b_shape_)); Tensor* y = ctx->Output(0, helper.OutputShape()); @@ -126,13 +128,12 @@ Status MatMul::Compute(OpKernelContext* ctx) const { auto* y_data = y->MutableData(); - xnn_status status = xnn_setup_fully_connected_nc_f32( - op0_.get(), - a->Shape()[0], - a->Data(), - y_data, - t_pool); + xnn_status status = xnn_reshape_fully_connected_nc_f32(op0_.get(), a->Shape()[0], threadpool); + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_fully_connected_nc_f32 returned ", status); + } + status = xnn_setup_fully_connected_nc_f32(op0_.get(), a->Data(), y_data); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_fully_connected_nc_f32 returned ", status); } @@ -144,7 +145,11 @@ 
Status MatMul::Compute(OpKernelContext* ctx) const { return Status::OK(); } -ONNX_OPERATOR_VERSIONED_KERNEL_EX(MatMul, kOnnxDomain, 1, 12, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(MatMul, kOnnxDomain, 1, 8, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + MatMul); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(MatMul, kOnnxDomain, 9, 12, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), MatMul); diff --git a/onnxruntime/core/providers/xnnpack/math/matmul.h b/onnxruntime/core/providers/xnnpack/math/matmul.h index f4ed92c6146fb..b76e42c4d3729 100644 --- a/onnxruntime/core/providers/xnnpack/math/matmul.h +++ b/onnxruntime/core/providers/xnnpack/math/matmul.h @@ -32,14 +32,6 @@ class MatMul : public XnnpackKernel { AllocatorPtr myAlloc; XnnpackOperator op0_ = nullptr; - -#ifdef XNN_CACHE_ENABLE -#if XNN_PLATFORM_JIT - xnn_code_cache code_cache_; -#endif - xnn_caches xnn_caches_ = {0, 0}; - xnn_weights_cache weights_cache_; -#endif }; } // namespace xnnpack diff --git a/onnxruntime/core/providers/xnnpack/nn/softmax.cc b/onnxruntime/core/providers/xnnpack/math/softmax.cc similarity index 80% rename from onnxruntime/core/providers/xnnpack/nn/softmax.cc rename to onnxruntime/core/providers/xnnpack/math/softmax.cc index bca84317ad891..87440b7814176 100644 --- a/onnxruntime/core/providers/xnnpack/nn/softmax.cc +++ b/onnxruntime/core/providers/xnnpack/math/softmax.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/xnnpack/nn/softmax.h" +#include "core/providers/xnnpack/math/softmax.h" #include @@ -25,6 +25,7 @@ bool IsQuantSoftmaxSupported(const NodeUnit& node_unit, const GraphViewer& graph output_type != TensorTypeUint8) { break; } + // to ensure its output scale and zp are 1/256 and 0, otherwise xnnpack EP has to do extra requantization // idealy, QlinearSoftmax or QDQSoftmax will keep this output scale and zp, but we have to handle some // qdq models converted from other framework @@ -33,6 +34,7 @@ bool IsQuantSoftmaxSupported(const NodeUnit& node_unit, const GraphViewer& graph if (fabs(q_scale.DataAsSpan()[0] - 1.0f / 256.0f) > 0.0001f) { break; } + if (zero_tensor) { Initializer q_zp(*zero_tensor, node_unit.ModelPath()); if (q_zp.DataAsSpan()[0] != 0) { @@ -57,6 +59,7 @@ bool Softmax::IsOnnxNodeSupported(const NodeUnit& node_unit, IsQuantSoftmaxSupported(node_unit, graph) == false) { return false; } + // use do {} while(false) so it's easier to set a breakpoint on the return do { // SoftMax has 1 input. 
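The Gemm and MatMul changes above, and the Softmax changes that follow, all move to the newer XNNPACK calling sequence in which the old single setup call is split into a reshape call (which takes the row count and thread pool) and a setup call (which only binds the input/output pointers) before running the operator. A condensed sketch of that flow using the same f32 fully-connected calls that appear in this diff (error handling trimmed; assumes the operator was created earlier with xnn_create_fully_connected_nc_f32):

#include <xnnpack.h>
#include <pthreadpool.h>

// reshape -> setup -> run, mirroring Gemm::Compute / MatMul::Compute in this PR.
xnn_status RunFullyConnectedF32(xnn_operator_t op, size_t rows,
                                const float* input, float* output,
                                pthreadpool_t threadpool) {
  xnn_status status = xnn_reshape_fully_connected_nc_f32(op, rows, threadpool);
  if (status != xnn_status_success) return status;  // sizes/threading decided here
  status = xnn_setup_fully_connected_nc_f32(op, input, output);  // bind pointers only
  if (status != xnn_status_success) return status;
  return xnn_run_operator(op, threadpool);
}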
@@ -133,6 +136,7 @@ Softmax::Softmax(const OpKernelInfo& info) : XnnpackKernel{info} { ORT_ENFORCE(status.IsOK(), "opset must be existed in attributes of QlinearSoftmax"); opset_ = gsl::narrow_cast(opset); } + int64_t axis = -1; Status status = info.GetAttr("axis", &axis); // our op checker function has ensured that axis must be the last dim @@ -162,23 +166,22 @@ Softmax::Softmax(const OpKernelInfo& info) : XnnpackKernel{info} { if (op_type_ == OpComputeType::op_compute_type_qu8) { // the order of input tensor, x,x_scale, x_zp, y_scale, y_zp OpQuantParam quant_param = ParseQuantParamForOp(info, x_dtype, 1); - xstatus = xnn_create_softmax_nc_qu8( - channels, - channels, - channels, - quant_param[0].first[0], // x_scale - quant_param[1].second, // y_zp - quant_param[1].first[0], // y_scale - 0, // flags, - &p); + xstatus = xnn_create_softmax_nc_qu8(channels, + channels, + channels, + quant_param[0].first[0], // x_scale + quant_param[1].second, // y_zp + quant_param[1].first[0], // y_scale + 0, // flags, + &p); } else if (op_type_ == OpComputeType::op_compute_type_fp32) { - xstatus = xnn_create_softmax_nc_f32( - channels, - channels, - channels, - 0, // flags, - &p); + xstatus = xnn_create_softmax_nc_f32(channels, + channels, + channels, + 0, // flags, + &p); } + ORT_ENFORCE(xstatus == xnn_status_success, "xnn_create_softmax_nc_", OpTypeToString(op_type_), " failed. Status:", xstatus); op0_.reset(p); @@ -194,39 +197,48 @@ Status Softmax::Compute(OpKernelContext* ctx) const { if (X_shape.Size() == 0) { return Status::OK(); } - pthreadpool_t t_pool = GetThreadPool(); + + pthreadpool_t threadpool = GetThreadPool(); const size_t N = X_shape.SizeToDimension(axis_); // const size_t D = X_shape.SizeFromDimension(axis_); // the step D is 1 xnn_status status = xnn_status_invalid_state; + + auto reshape_fn = op_type_ == OpComputeType::op_compute_type_qu8 ? 
xnn_reshape_softmax_nc_qu8 + : xnn_reshape_softmax_nc_f32; + status = reshape_fn(op0_.get(), N, threadpool); + + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_softmax_nc_", OpTypeToString(op_type_), + " returned ", status); + } + if (op_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_softmax_nc_qu8( - op0_.get(), - N, - X->Data(), - Y->MutableData(), - t_pool); + status = xnn_setup_softmax_nc_qu8(op0_.get(), X->Data(), Y->MutableData()); } else { - status = xnn_setup_softmax_nc_f32( - op0_.get(), - N, - X->Data(), - Y->MutableData(), - t_pool); + status = xnn_setup_softmax_nc_f32(op0_.get(), X->Data(), Y->MutableData()); } + if (status != xnn_status_success) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_softmax_nc_", - OpTypeToString(op_type_), " returned ", status); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_softmax_nc_", OpTypeToString(op_type_), + " returned ", status); } - status = xnn_run_operator(op0_.get(), t_pool); + + status = xnn_run_operator(op0_.get(), threadpool); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); } + return Status::OK(); } -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Softmax, kOnnxDomain, 1, 12, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Softmax, kOnnxDomain, 1, 10, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Softmax); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Softmax, kOnnxDomain, 11, 12, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Softmax); + ONNX_OPERATOR_KERNEL_EX(Softmax, kOnnxDomain, 13, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Softmax); diff --git a/onnxruntime/core/providers/xnnpack/nn/softmax.h b/onnxruntime/core/providers/xnnpack/math/softmax.h similarity index 100% rename from onnxruntime/core/providers/xnnpack/nn/softmax.h rename to onnxruntime/core/providers/xnnpack/math/softmax.h diff --git a/onnxruntime/core/providers/xnnpack/nn/average_pool.cc b/onnxruntime/core/providers/xnnpack/nn/average_pool.cc index 767218fbfd20b..58c209a13cd0c 100644 --- a/onnxruntime/core/providers/xnnpack/nn/average_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/average_pool.cc @@ -2,10 +2,13 @@ // Licensed under the MIT License. #include "core/providers/xnnpack/nn/average_pool.h" +#include + #include "core/common/status.h" #include "core/graph/graph.h" #include "core/providers/utils.h" #include "core/framework/tensorprotoutils.h" +#include "core/providers/xnnpack/xnnpack_init.h" #include "core/providers/xnnpack/detail/utils.h" namespace onnxruntime { @@ -90,6 +93,10 @@ bool AveragePool::IsOnnxNodeSupported(const NodeUnit& node_unit, const auto& inputs = node_unit.Inputs(); // use do {} while(false) so it's easier to set a breakpoint on the return do { + if (node_unit.SinceVersion() < 7) { + break; + } + // AveragePool has 1 input. 
const auto& x_arg = inputs[0].node_arg; @@ -141,6 +148,11 @@ bool AveragePool::IsOnnxNodeSupported(const NodeUnit& node_unit, break; } + // need dilations to all be 1 + if (!pool_attrs.default_dilations) { + break; + } + supported = true; } while (false); @@ -221,24 +233,47 @@ Status AveragePool::Compute(OpKernelContext* context) const { return Status::OK(); } - pthreadpool_t t_pool = GetThreadPool(); - xnn_status status = xnn_status_invalid_state; + pthreadpool_t threadpool = GetThreadPool(); + + // setup allocator/automated dellocate for workspace + size_t workspace_size = 0; + size_t workspace_alignment = 0; + xnn_allocator* allocator = GetStoredAllocator().second; + auto deallocator = [allocator](void* ptr) { allocator->aligned_deallocate(allocator->context, ptr); }; + + std::unique_ptr workspace(nullptr, deallocator); + + auto reshape_fn = (avgpool_type_ == OpComputeType::op_compute_type_fp32) + ? xnn_reshape_average_pooling2d_nhwc_f32 + : xnn_reshape_average_pooling2d_nhwc_qu8; + + auto status = reshape_fn(op0_.get(), N, H, W, + &workspace_size, &workspace_alignment, + /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, + threadpool); + + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_average_pooling2d_nhwc_", OpTypeToString(avgpool_type_), + " returned ", status); + } + + workspace.reset(allocator->aligned_allocate(allocator->context, XNN_ALLOCATION_ALIGNMENT, workspace_size)); + if (avgpool_type_ == OpComputeType::op_compute_type_fp32) { - status = xnn_setup_average_pooling2d_nhwc_f32(op0_.get(), N, H, W, - X.Data(), Y.MutableData(), - t_pool /*threadpool */); + status = xnn_setup_average_pooling2d_nhwc_f32(op0_.get(), workspace.get(), + X.Data(), Y.MutableData()); + } else if (avgpool_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_average_pooling2d_nhwc_qu8(op0_.get(), N, H, W, - X.Data(), Y.MutableData(), - t_pool /*threadpool */); + status = xnn_setup_average_pooling2d_nhwc_qu8(op0_.get(), workspace.get(), + X.Data(), Y.MutableData()); } if (status != xnn_status_success) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_average_pooling2d_nhwc_", - OpTypeToString(avgpool_type_), " returned ", status); + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_average_pooling2d_nhwc_", OpTypeToString(avgpool_type_), + " returned ", status); } - status = xnn_run_operator(op0_.get(), t_pool); + status = xnn_run_operator(op0_.get(), threadpool); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); } @@ -246,8 +281,26 @@ Status AveragePool::Compute(OpKernelContext* context) const { return Status::OK(); } +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + AveragePool, kMSInternalNHWCDomain, 7, 9, + kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + AveragePool); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + AveragePool, kMSInternalNHWCDomain, 10, 10, + kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + AveragePool); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX( + AveragePool, kMSInternalNHWCDomain, 11, 18, + kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + AveragePool); + ONNX_OPERATOR_KERNEL_EX( - AveragePool, kMSInternalNHWCDomain, 11, + AveragePool, kMSInternalNHWCDomain, 19, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), AveragePool); diff --git 
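The pooling, convolution and resize kernels additionally thread a scratch workspace through the new API: xnn_reshape_* reports the required workspace size and alignment, the kernel allocates it through the EP's stored XNNPACK allocator, and xnn_setup_* receives the workspace pointer in place of the N/H/W arguments. A sketch of that flow for the float AveragePool path shown above (the unique_ptr deleter spelling is an assumption; <memory> is required and error checks are omitted):

    size_t workspace_size = 0;
    size_t workspace_alignment = 0;
    xnn_allocator* allocator = GetStoredAllocator().second;
    auto deallocate = [allocator](void* ptr) { allocator->aligned_deallocate(allocator->context, ptr); };
    std::unique_ptr<void, decltype(deallocate)> workspace(nullptr, deallocate);

    // reshape computes output extents and reports the scratch memory needed for N x H x W
    xnn_status status = xnn_reshape_average_pooling2d_nhwc_f32(op0_.get(), N, H, W,
                                                               &workspace_size, &workspace_alignment,
                                                               /*output_height_out=*/nullptr,
                                                               /*output_width_out=*/nullptr,
                                                               threadpool);
    // allocate the workspace with the alignment defined in xnnpack_init.h and hand it to setup
    workspace.reset(allocator->aligned_allocate(allocator->context, XNN_ALLOCATION_ALIGNMENT, workspace_size));
    status = xnn_setup_average_pooling2d_nhwc_f32(op0_.get(), workspace.get(),
                                                  X.Data<float>(), Y.MutableData<float>());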
a/onnxruntime/core/providers/xnnpack/nn/conv.cc b/onnxruntime/core/providers/xnnpack/nn/conv.cc index 0772dec59e30e..0cdb9c840aa2d 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv.cc @@ -3,12 +3,13 @@ #include "conv.h" +#include "core/common/gsl.h" #include "core/common/inlined_containers_fwd.h" +#include "core/framework/tensorprotoutils.h" #include "core/framework/transpose_helper.h" #include "core/providers/utils.h" +#include "core/providers/xnnpack/xnnpack_init.h" #include "core/providers/xnnpack/detail/utils.h" -#include "core/framework/tensorprotoutils.h" -#include "core/common/gsl.h" namespace onnxruntime { namespace xnnpack { @@ -64,21 +65,48 @@ Status Conv::Compute(OpKernelContext* context) const { if (Y->Shape().Size() == 0) { return Status::OK(); } - pthreadpool_t t_pool = GetThreadPool(); - xnn_status status = xnn_status_invalid_state; + pthreadpool_t threadpool = GetThreadPool(); + + // setup allocator/automated dellocate for workspace + size_t workspace_size = 0; + size_t workspace_alignment = 0; + xnn_allocator* allocator = GetStoredAllocator().second; + auto deallocator = [allocator](void* ptr) { allocator->aligned_deallocate(allocator->context, ptr); }; + std::unique_ptr workspace(nullptr, deallocator); + + auto reshape_fn = xnn_reshape_convolution2d_nhwc_f32; + if (conv_type_ == OpComputeType::op_compute_type_qs8) { + reshape_fn = xnn_reshape_convolution2d_nhwc_qs8; + } else if (conv_type_ == OpComputeType::op_compute_type_qu8) { + reshape_fn = xnn_reshape_convolution2d_nhwc_qu8; + } else if (conv_type_ == OpComputeType::op_compute_type_qs8_per_channel) { + reshape_fn = xnn_reshape_convolution2d_nhwc_qs8_qc8w; + } + + auto status = reshape_fn(op0_.get(), N, H, W, + &workspace_size, &workspace_alignment, + /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, + threadpool); + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_convolution2d_nhwc_", OpTypeToString(conv_type_), + "returned ", status); + } + + workspace.reset(allocator->aligned_allocate(allocator->context, XNN_ALLOCATION_ALIGNMENT, workspace_size)); + if (conv_type_ == OpComputeType::op_compute_type_fp32) { - status = xnn_setup_convolution2d_nhwc_f32(op0_.get(), N, H, W, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_convolution2d_nhwc_f32(op0_.get(), workspace.get(), X.Data(), + Y->MutableData()); } else if (conv_type_ == OpComputeType::op_compute_type_qs8) { - status = xnn_setup_convolution2d_nhwc_qs8(op0_.get(), N, H, W, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_convolution2d_nhwc_qs8(op0_.get(), workspace.get(), X.Data(), + Y->MutableData()); } else if (conv_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_convolution2d_nhwc_qu8(op0_.get(), N, H, W, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_convolution2d_nhwc_qu8(op0_.get(), workspace.get(), X.Data(), + Y->MutableData()); } else if (conv_type_ == OpComputeType::op_compute_type_qs8_per_channel) { - status = xnn_setup_convolution2d_nhwc_qc8(op0_.get(), N, H, W, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_convolution2d_nhwc_qs8_qc8w(op0_.get(), workspace.get(), X.Data(), + Y->MutableData()); } if (status != xnn_status_success) { @@ -86,7 +114,7 @@ Status Conv::Compute(OpKernelContext* context) const { OpTypeToString(conv_type_), "returned ", status); } - status = xnn_run_operator(op0_.get(), t_pool); + status = 
xnn_run_operator(op0_.get(), threadpool); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); } @@ -94,6 +122,10 @@ Status Conv::Compute(OpKernelContext* context) const { return Status::OK(); } +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Conv, kMSInternalNHWCDomain, 1, 10, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), + Conv); + ONNX_OPERATOR_KERNEL_EX(Conv, kMSInternalNHWCDomain, 11, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", DataTypeImpl::GetTensorType()), Conv); diff --git a/onnxruntime/core/providers/xnnpack/nn/conv_base.cc b/onnxruntime/core/providers/xnnpack/nn/conv_base.cc index b692f373ff4ce..d21014569234e 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv_base.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv_base.cc @@ -23,7 +23,8 @@ Status CreateXnnpackKernel(const ConvAttributes* conv_attrs_ptr, const std::optional>& clip_min_max, const Tensor& Weight, const Tensor* Bias, XnnpackOperator& op_uptr, - xnn_caches_t caches_t, + xnn_code_cache_t code_cache, + xnn_weights_cache_t weights_cache, const OpQuantParam& quant_param, OpComputeType conv_type, bool is_transpose = false) { @@ -75,7 +76,7 @@ Status CreateXnnpackKernel(const ConvAttributes* conv_attrs_ptr, C, M, // input channel stride, output channel stride Weight.Data(), B_data, foutput_min, foutput_max, flags, - caches_t, + code_cache, weights_cache, &p); } else if (conv_type == OpComputeType::op_compute_type_qs8) { const float output_scale = quant_param[2].first[0]; @@ -99,7 +100,7 @@ Status CreateXnnpackKernel(const ConvAttributes* conv_attrs_ptr, quant_param[2].second, quant_param[2].first[0], output_min, output_max, flags, - caches_t, + code_cache, weights_cache, &p); } else if (conv_type == OpComputeType::op_compute_type_qs8_per_channel) { auto* B_data = Bias ? Bias->Data() : nullptr; @@ -107,7 +108,7 @@ Status CreateXnnpackKernel(const ConvAttributes* conv_attrs_ptr, const int8_t output_zero_point = quant_param[2].second; const int8_t output_min = xnn_u8s8_quantize(foutput_min, output_scale, output_zero_point); const int8_t output_max = xnn_u8s8_quantize(foutput_max, output_scale, output_zero_point); - status = xnn_create_convolution2d_nhwc_qc8( + status = xnn_create_convolution2d_nhwc_qs8_qc8w( input_padding_top, input_padding_right, input_padding_bottom, input_padding_left, kernel_height, kernel_width, subsampling_height, subsampling_width, @@ -123,7 +124,7 @@ Status CreateXnnpackKernel(const ConvAttributes* conv_attrs_ptr, quant_param[2].second, quant_param[2].first[0], output_min, output_max, flags, - caches_t, + code_cache, weights_cache, &p); } else if (conv_type == OpComputeType::op_compute_type_qu8) { const auto* B_data = Bias ? Bias->Data() : nullptr; @@ -148,15 +149,17 @@ Status CreateXnnpackKernel(const ConvAttributes* conv_attrs_ptr, quant_param[2].second, quant_param[2].first[0], output_min, output_max, flags, - caches_t, + code_cache, weights_cache, &p); } + if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Failed to create xnnpack kernel. xnn_create_", is_transpose ? 
"deconvolution2d" : "convolution2d", "_nhwc_", OpTypeToString(conv_type), " returned ", status); } + op_uptr.reset(p); return Status::OK(); } @@ -296,6 +299,11 @@ bool ConvBase::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& const onnxruntime::Node& node = node_unit.GetNode(); // use do {} while(false) so it's easier to set a breakpoint on the return do { + // Internal NHWC domain starts at opset 11 + if (node_unit.SinceVersion() < 11) { + break; + } + // Conv has at least 2 inputs. const auto& inputs = node_unit.Inputs(); const auto& x_arg = inputs[0].node_arg; @@ -367,7 +375,7 @@ bool ConvBase::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& } ConvBase::ConvBase(const OpKernelInfo& info, bool is_transpose) - : XnnpackKernel(info), + : XnnpackKernel(info, /*enable_caches*/ true), conv_attrs_(info), conv_transpose_attrs_(info), convbase_attrs_ref_(is_transpose ? conv_transpose_attrs_ : conv_attrs_), @@ -383,16 +391,7 @@ ConvBase::ConvBase(const OpKernelInfo& info, bool is_transpose) } } } - // xnnpack cache_code, unfortunately these definitions are only available in xnnpack/cache.h, -#ifdef XNN_CACHE_ENABLE -#if XNN_PLATFORM_JIT - xnn_init_code_cache(&code_cache_); - xnn_caches_.code_cache = &code_cache_; -#endif - // TODO(Jicwen) enable weight-cache and code-cache - xnn_init_weights_cache(&weights_cache_); - xnn_caches_.weights_cache = &weights_cache_; -#endif + const auto& node{Node()}; const auto& input_defs = node.InputDefs(); const NodeArg& X = *input_defs[0]; @@ -477,11 +476,7 @@ ConvBase::ConvBase(const OpKernelInfo& info, bool is_transpose) Status ConvBase::CreateKernel() { auto ret = CreateXnnpackKernel(&convbase_attrs_ref_, C_, M_, kernel_shape_, clip_min_max_, packed_w_, B_, op0_, -#ifdef XNN_CACHE_ENABLE - &xnn_caches_, -#else - 0, -#endif + GetCodeCache(), GetWeightsCache(), quant_param_, conv_type_, is_transpose_); return ret; } diff --git a/onnxruntime/core/providers/xnnpack/nn/conv_base.h b/onnxruntime/core/providers/xnnpack/nn/conv_base.h index d3501a56ea24c..53ad51378c6be 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv_base.h +++ b/onnxruntime/core/providers/xnnpack/nn/conv_base.h @@ -39,14 +39,6 @@ class ConvBase : public XnnpackKernel { std::optional> clip_min_max_; XnnpackOperator op0_ = nullptr; - // we can't have the definition here because we can't import xnnpack/cache.h -#ifdef XNN_CACHE_ENABLE -#if XNN_PLATFORM_JIT - xnn_code_cache code_cache_; -#endif - xnn_caches xnn_caches_ = {0, 0}; - xnn_weights_cache weights_cache_; -#endif OpQuantParam quant_param_; OpComputeType conv_type_ = OpComputeType::op_compute_type_invalid; }; diff --git a/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc b/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc index 61d8f7f488547..8698c0739509d 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv_transpose.cc @@ -81,29 +81,34 @@ Status ConvTranspose::Compute(OpKernelContext* context) const { if (Y->Shape().Size() == 0) { return Status::OK(); } - pthreadpool_t t_pool = GetThreadPool(); + pthreadpool_t threadpool = GetThreadPool(); auto output_pad_0 = gsl::narrow_cast(conv_transpose_attrs_.output_padding[0]); auto output_pad_1 = gsl::narrow_cast(conv_transpose_attrs_.output_padding[1]); xnn_status status = xnn_status_invalid_state; + + auto reshape_fn = xnn_reshape_deconvolution2d_nhwc_f32; + if (conv_type_ == OpComputeType::op_compute_type_qs8) { + reshape_fn = xnn_reshape_deconvolution2d_nhwc_qs8; + } else if 
(conv_type_ == OpComputeType::op_compute_type_qu8) { + reshape_fn = xnn_reshape_deconvolution2d_nhwc_qu8; + } + + status = reshape_fn(op0_.get(), N, H, W, output_pad_0, output_pad_1, + /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, + threadpool); + + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_deconvolution2d_nhwc_", + OpTypeToString(conv_type_), " returned ", status); + } + if (conv_type_ == OpComputeType::op_compute_type_fp32) { - status = xnn_setup_deconvolution2d_nhwc_f32( - op0_.get(), N, H, W, - output_pad_0, - output_pad_1, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_deconvolution2d_nhwc_f32(op0_.get(), X.Data(), Y->MutableData()); } else if (conv_type_ == OpComputeType::op_compute_type_qs8) { - status = xnn_setup_deconvolution2d_nhwc_qs8( - op0_.get(), N, H, W, - output_pad_0, - output_pad_1, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_deconvolution2d_nhwc_qs8(op0_.get(), X.Data(), Y->MutableData()); } else if (conv_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_deconvolution2d_nhwc_qu8( - op0_.get(), N, H, W, - output_pad_0, - output_pad_1, X.Data(), Y->MutableData(), - t_pool /*threadpool*/); + status = xnn_setup_deconvolution2d_nhwc_qu8(op0_.get(), X.Data(), Y->MutableData()); } if (status != xnn_status_success) { @@ -111,7 +116,7 @@ Status ConvTranspose::Compute(OpKernelContext* context) const { OpTypeToString(conv_type_), " returned ", status); } - status = xnn_run_operator(op0_.get(), t_pool); + status = xnn_run_operator(op0_.get(), threadpool); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); } diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index de6dd68bba9c3..2ef9f97f77b14 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -41,6 +41,10 @@ bool MaxPool::IsOnnxNodeSupported(const NodeUnit& node_unit, const onnxruntime::Node& node = node_unit.GetNode(); // use do {} while(false) so it's easier to set a breakpoint on the return do { + if (node_unit.SinceVersion() < 8) { + break; + } + // MaxPool has 1 input. 
auto input_defs = node.InputDefs(); const auto& x_arg = *input_defs[0]; @@ -220,20 +224,29 @@ Status MaxPool::Compute(OpKernelContext* context) const { return Status::OK(); } - pthreadpool_t t_pool = GetThreadPool(); - xnn_status status = xnn_status_invalid_state; + pthreadpool_t threadpool = GetThreadPool(); + + auto reshape_fn = xnn_reshape_max_pooling2d_nhwc_f32; + if (maxpool_type_ == OpComputeType::op_compute_type_qu8) + reshape_fn = xnn_reshape_max_pooling2d_nhwc_u8; + else if (maxpool_type_ == OpComputeType::op_compute_type_qs8) { + reshape_fn = xnn_reshape_max_pooling2d_nhwc_s8; + } + + auto status = reshape_fn(op0_.get(), N, H, W, + /*output_height_out=*/nullptr, /*output_width_out=*/nullptr, + threadpool); + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_max_pooling2d_nhwc_", + OpTypeToString(maxpool_type_), " returned ", status); + } + if (maxpool_type_ == OpComputeType::op_compute_type_fp32) { - status = xnn_setup_max_pooling2d_nhwc_f32(op0_.get(), N, H, W, - X.Data(), Y->MutableData(), - t_pool /*threadpool */); + status = xnn_setup_max_pooling2d_nhwc_f32(op0_.get(), X.Data(), Y->MutableData()); } else if (maxpool_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_max_pooling2d_nhwc_u8(op0_.get(), N, H, W, - X.Data(), Y->MutableData(), - t_pool /*threadpool */); + status = xnn_setup_max_pooling2d_nhwc_u8(op0_.get(), X.Data(), Y->MutableData()); } else { - status = xnn_setup_max_pooling2d_nhwc_s8(op0_.get(), N, H, W, - X.Data(), Y->MutableData(), - t_pool /*threadpool */); + status = xnn_setup_max_pooling2d_nhwc_s8(op0_.get(), X.Data(), Y->MutableData()); } if (status != xnn_status_success) { @@ -241,7 +254,7 @@ Status MaxPool::Compute(OpKernelContext* context) const { OpTypeToString(maxpool_type_), " returned ", status); } - status = xnn_run_operator(op0_.get(), t_pool); + status = xnn_run_operator(op0_.get(), threadpool); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); } @@ -249,12 +262,24 @@ Status MaxPool::Compute(OpKernelContext* context) const { return Status::OK(); } -ONNX_OPERATOR_VERSIONED_KERNEL_EX( - MaxPool, kMSInternalNHWCDomain, 11, 11, kXnnpackExecutionProvider, - KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType(), - DataTypeImpl::GetTensorType()}), - MaxPool); +ONNX_OPERATOR_VERSIONED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 8, 9, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + MaxPool); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 10, 10, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + MaxPool); + +ONNX_OPERATOR_VERSIONED_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 11, 11, kXnnpackExecutionProvider, + KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType(), + DataTypeImpl::GetTensorType()}), + MaxPool); + ONNX_OPERATOR_KERNEL_EX(MaxPool, kMSInternalNHWCDomain, 12, kXnnpackExecutionProvider, KernelDefBuilder() .TypeConstraint("T", {DataTypeImpl::GetTensorType(), diff --git a/onnxruntime/core/providers/xnnpack/nn/resize.cc b/onnxruntime/core/providers/xnnpack/tensor/resize.cc similarity index 67% rename from onnxruntime/core/providers/xnnpack/nn/resize.cc rename to 
onnxruntime/core/providers/xnnpack/tensor/resize.cc index 76c6b6acbfe32..0c9e2e9fc17a2 100644 --- a/onnxruntime/core/providers/xnnpack/nn/resize.cc +++ b/onnxruntime/core/providers/xnnpack/tensor/resize.cc @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -#include "core/providers/xnnpack/nn/resize.h" +#include "core/providers/xnnpack/tensor/resize.h" #include #include @@ -10,6 +10,7 @@ #include "core/common/inlined_containers_fwd.h" #include "core/framework/op_kernel.h" #include "core/optimizer/initializer.h" +#include "core/providers/xnnpack/xnnpack_init.h" namespace onnxruntime { namespace xnnpack { @@ -18,26 +19,67 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, const GraphViewer& graph_viewer) { bool supported = false; do { + if (node_unit.SinceVersion() < 10) { + break; + } + // Resize has 1-4 input. const auto& inputs = node_unit.Inputs(); const auto& x_arg = inputs[0].node_arg; const auto* x_type = x_arg.TypeAsProto(); - if (x_type == nullptr || - (x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && - x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && - x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8)) { + if (x_type == nullptr || (x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_FLOAT && + x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_UINT8 && + x_type->tensor_type().elem_type() != ONNX_NAMESPACE::TensorProto_DataType_INT8)) { break; } const auto* x_shape = x_arg.Shape(); - //'bilinear' == 2-D input or 4-D input with outermost 2 scales as 1 (NCHW) or - // 4-D input with outermost and innermost scales as 1 (NHWC) - // but we just support 4-d tensor for now, and the channel must be known. + + // 'bilinear' == 2-D input or 4-D input with outermost 2 scales as 1 (NCHW) can be supported. + // we only support 4-d tensor for now, and the channel must be known. + // we assume the input in NCHW for this test. if (!x_shape || x_shape->dim_size() != 4 || x_shape->dim(1).dim_value() <= 0) { break; } + // validate it is in fact NCHW + // + // opset 10 had `scales` as input 1 and no sizes. later opsets added roi as input 1 followed by scales and sizes. + auto opset_version = node_unit.SinceVersion(); + size_t scale_idx = opset_version == 10 ? 1 : 2; + size_t size_idx = 3; + + // onnx shape inferencing validates that one and not both of sizes and scales are provided + const auto* scale_tensor = inputs.size() >= scale_idx + 1 + ? graph_viewer.GetConstantInitializer(inputs[scale_idx].node_arg.Name(), true) + : nullptr; + const auto* size_tensor = opset_version > 10 && inputs.size() >= size_idx + 1 + ? graph_viewer.GetConstantInitializer(inputs[size_idx].node_arg.Name(), true) + : nullptr; + + // if both scales and sizes are nullptr the one that was provided was not a constant initializer + if (!scale_tensor && !size_tensor) { + break; + } + + // check the scale for the second dim is 1 or the size of the second dim matches the input shape. + // if not, it is not the C dim as a Resize will not change the number of channels. 
+ InlinedVector scale(4, 1.0F); + if (scale_tensor) { + const Initializer scale_val(*scale_tensor, node_unit.ModelPath()); + if (scale_val.DataAsSpan()[1] != 1.0F) { + break; + } + } + + if (size_tensor) { + const Initializer size_val(*size_tensor, node_unit.ModelPath()); + if (size_val.DataAsSpan()[1] != x_shape->dim(1).dim_value()) { + break; + } + } + const auto* output_shape = node_unit.Outputs()[0].node_arg.Shape(); bool length_resized_compatible_pytorch_half_pixel = true; // when length_resized > 1, there is no difference between pytorch_half_pixel and half_pixel @@ -48,18 +90,11 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, // if coordinate_transformation_mode is "pytorch_half_pixel", // x_original = length_resized > 1 ? (x_resized + 0.5) / scale - 0.5 : 0 // - if (output_shape->dim(2).dim_value() <= 1 || output_shape->dim(1).dim_value() <= 1) { + if (output_shape->dim(2).dim_value() <= 1 || output_shape->dim(3).dim_value() <= 1) { + // we don't know the output H or W so we don't know if it will be compatible length_resized_compatible_pytorch_half_pixel = false; } - // Refer to onnxruntime/core/providers/cpu/tensor/upsamplebase.h, - size_t scale_idx = 2; - size_t size_idx = 3; - auto opset_version = node_unit.SinceVersion(); - if (opset_version == 10) { - scale_idx = 1; - } - ProtoHelperNodeContext nc(node_unit.GetNode()); OpNodeProtoHelper info(&nc); @@ -78,6 +113,7 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, std::vector axes; if (info.GetAttrs("axes", axes).IsOK() && axes.size() > 0) { + // TODO: We should be able to handle this if required break; } @@ -95,9 +131,10 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, // Coordinate transformation mode attr was introduced in version 11. // before that asymmetric mode was the only available transformation mode std::string coordinate_transform_mode_name = - opset_version > 10 - ? info.GetAttrOrDefault("coordinate_transformation_mode", "half_pixel") - : "asymmetric"; + opset_version > 10 ? info.GetAttrOrDefault("coordinate_transformation_mode", "half_pixel") + : "asymmetric"; + + // TODO: Opset 19 added half_pixel_symmetric. Need to see if that can be supported. if (coordinate_transform_mode_name != "asymmetric" && coordinate_transform_mode_name != "half_pixel" && @@ -106,59 +143,7 @@ bool Resize::IsOnnxNodeSupported(const NodeUnit& node_unit, break; } - auto exclude_outside = info.GetAttrOrDefault("exclude_outside", 0) == 0 ? false : true; - if (exclude_outside) { - break; - } - - // roi only takes effect when coordinate_transformation_mode is "tf_crop_and_resize" - - // size or scales shouldnt't be provided in the same time but should at least be provided one of them - const auto* scale_tensor = inputs.size() >= scale_idx + 1 - ? graph_viewer.GetConstantInitializer(inputs[scale_idx].node_arg.Name(), true) - : nullptr; - const auto* size_tensor = inputs.size() >= size_idx + 1 - ? 
graph_viewer.GetConstantInitializer(inputs[size_idx].node_arg.Name(), true) - : nullptr; - - bool has_size = false; - bool has_scale = false; - InlinedVector scale(4, 1.0F); - if (scale_tensor) { - const Initializer scale_val(*scale_tensor, node_unit.ModelPath()); - auto scale_span = scale_val.DataAsSpan(); - if (scale_span.size() == 4) { - has_scale = true; - std::copy(scale_span.begin(), scale_span.end(), scale.begin()); - } - } - - if (size_tensor) { - auto input_shape = utils::GetTensorShapeFromTensorShapeProto(*x_shape); - const Initializer size_val(*size_tensor, node_unit.ModelPath()); - - auto size_span = size_val.DataAsSpan(); - if (size_span.size() == 4) { - has_size = true; - scale = {size_span[0] / static_cast(input_shape[0]), - size_span[1] / static_cast(input_shape[1]), - size_span[2] / static_cast(input_shape[2]), - size_span[3] / static_cast(input_shape[3])}; - } - } - - if ((has_size && has_scale) || (!has_size && !has_scale)) { - break; - } - - if (scale[0] != 1.0F || (scale[1] != 1.0F && scale[3] != 1.0F)) { - break; - } - - // only support xnn_create_resize_bilinear2d_nchw_f32 - const bool is_NHWC = scale[3] == 1.0F; - if (!is_NHWC && (x_type->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_UINT8 || - x_type->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType_INT8)) { + if (info.GetAttrOrDefault("exclude_outside", 0) != 0) { break; } @@ -210,8 +195,7 @@ Resize::Resize(const OpKernelInfo& info) : UpsampleBase(info), XnnpackKernel{inf } } - is_NHWC_ = scales_[3] == 1.0F; - int64_t channels = x_shape->dim(is_NHWC_ ? 3 : 1).dim_value(); + int64_t channels = x_shape->dim(3).dim_value(); uint32_t flags = 0; ORT_ENFORCE(mode_ == UpsampleMode::LINEAR, "only support bilinear resize"); @@ -225,18 +209,16 @@ Resize::Resize(const OpKernelInfo& info) : UpsampleBase(info), XnnpackKernel{inf xnn_status xstatus = xnn_status_invalid_state; struct xnn_operator* p = nullptr; if (op_type_ == OpComputeType::op_compute_type_fp32) { - auto create_func = is_NHWC_ ? xnn_create_resize_bilinear2d_nhwc_f32 : xnn_create_resize_bilinear2d_nchw_f32; - xstatus = create_func( - channels, channels, channels, flags, &p); + xstatus = xnn_create_resize_bilinear2d_nhwc_f32(channels, channels, channels, flags, &p); } else if (op_type_ == OpComputeType::op_compute_type_qu8) { - xstatus = xnn_create_resize_bilinear2d_nhwc_u8( - channels, channels, channels, flags, &p); + xstatus = xnn_create_resize_bilinear2d_nhwc_u8(channels, channels, channels, flags, &p); } else { - xstatus = xnn_create_resize_bilinear2d_nhwc_s8( - channels, channels, channels, flags, &p); + xstatus = xnn_create_resize_bilinear2d_nhwc_s8(channels, channels, channels, flags, &p); } - ORT_ENFORCE(xstatus == xnn_status_success, "xnn_create_resize_bilinear2d_nhwc_", - OpTypeToString(op_type_), " failed. Status:", xstatus); + + ORT_ENFORCE(xstatus == xnn_status_success, "xnn_create_resize_bilinear2d_nhwc_", OpTypeToString(op_type_), " failed. Status:", + xstatus); + op0_.reset(p); } @@ -245,48 +227,56 @@ Status Resize::ComputeInternal(OpKernelContext* ctx, const Tensor* input, const TensorShapeVector& output_dims) const { const auto& X_shape = input->Shape(); auto N = X_shape[0]; - auto H = is_NHWC_ ? X_shape[1] : X_shape[2]; - auto W = is_NHWC_ ? 
X_shape[2] : X_shape[3]; + auto H = X_shape[1]; + auto W = X_shape[2]; Tensor* output = ctx->Output(0, TensorShape(output_dims)); - pthreadpool_t t_pool = GetThreadPool(); - xnn_status status = xnn_status_invalid_state; + pthreadpool_t threadpool = GetThreadPool(); + + // setup allocator/automated dellocate for workspace + size_t workspace_size = 0; + size_t workspace_alignment = 0; + xnn_allocator* allocator = GetStoredAllocator().second; + auto deallocator = [allocator](void* ptr) { allocator->aligned_deallocate(allocator->context, ptr); }; + std::unique_ptr workspace(nullptr, deallocator); + + auto reshape_fn = xnn_reshape_resize_bilinear2d_nhwc_f32; + if (op_type_ == OpComputeType::op_compute_type_qu8) { + reshape_fn = xnn_reshape_resize_bilinear2d_nhwc_u8; + } else if (op_type_ == OpComputeType::op_compute_type_qs8) { + reshape_fn = xnn_reshape_resize_bilinear2d_nhwc_s8; + } + + auto status = reshape_fn(op0_.get(), N, H, W, output_dims[1], output_dims[2], + &workspace_size, &workspace_alignment, threadpool); + if (status != xnn_status_success) { + return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_reshape_resize_bilinear2d_nhwc_", OpTypeToString(op_type_), + " returned ", status); + } + + workspace.reset(allocator->aligned_allocate(allocator->context, XNN_ALLOCATION_ALIGNMENT, workspace_size)); + if (op_type_ == OpComputeType::op_compute_type_fp32) { - auto oH = is_NHWC_ ? output_dims[1] : output_dims[2]; - auto oW = is_NHWC_ ? output_dims[2] : output_dims[3]; - auto setup_func = is_NHWC_ ? xnn_setup_resize_bilinear2d_nhwc_f32 : xnn_setup_resize_bilinear2d_nchw_f32; - status = setup_func( - op0_.get(), - N, - H, W, oH, oW, - input->Data(), - output->MutableData(), - t_pool); + status = xnn_setup_resize_bilinear2d_nhwc_f32(op0_.get(), workspace.get(), input->Data(), + output->MutableData()); } else if (op_type_ == OpComputeType::op_compute_type_qu8) { - status = xnn_setup_resize_bilinear2d_nhwc_u8( - op0_.get(), - N, - H, W, output_dims[1], output_dims[2], - input->Data(), - output->MutableData(), - t_pool); + status = xnn_setup_resize_bilinear2d_nhwc_u8(op0_.get(), workspace.get(), input->Data(), + output->MutableData()); } else { - status = xnn_setup_resize_bilinear2d_nhwc_s8( - op0_.get(), - N, - H, W, output_dims[1], output_dims[2], - input->Data(), - output->MutableData(), - t_pool); + status = xnn_setup_resize_bilinear2d_nhwc_s8(op0_.get(), workspace.get(), input->Data(), + output->MutableData()); } + if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_setup_resize_bilinear2d_nhwc_", OpTypeToString(op_type_), " returned ", status); } - status = xnn_run_operator(op0_.get(), t_pool); + + status = xnn_run_operator(op0_.get(), threadpool); if (status != xnn_status_success) { return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "xnn_run_operator returned ", status); } + return Status::OK(); } @@ -315,29 +305,29 @@ Status Resize::Compute(OpKernelContext* ctx) const { return ComputeInternal(ctx, X, output_shape); } -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kOnnxDomain, 10, 10, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kMSInternalNHWCDomain, 10, 10, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), Resize); -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kOnnxDomain, 11, 12, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kMSInternalNHWCDomain, 11, 12, kXnnpackExecutionProvider, 
KernelDefBuilder().TypeConstraint("T1", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), Resize); -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kOnnxDomain, 13, 17, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kMSInternalNHWCDomain, 13, 17, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T1", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), Resize); -ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kOnnxDomain, 18, 18, kXnnpackExecutionProvider, +ONNX_OPERATOR_VERSIONED_KERNEL_EX(Resize, kMSInternalNHWCDomain, 18, 18, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T1", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), Resize); -ONNX_OPERATOR_KERNEL_EX(Resize, kOnnxDomain, 19, kXnnpackExecutionProvider, +ONNX_OPERATOR_KERNEL_EX(Resize, kMSInternalNHWCDomain, 19, kXnnpackExecutionProvider, KernelDefBuilder().TypeConstraint("T1", {DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType(), DataTypeImpl::GetTensorType()}), diff --git a/onnxruntime/core/providers/xnnpack/nn/resize.h b/onnxruntime/core/providers/xnnpack/tensor/resize.h similarity index 98% rename from onnxruntime/core/providers/xnnpack/nn/resize.h rename to onnxruntime/core/providers/xnnpack/tensor/resize.h index 4975510ee7db4..06ff1bdb61f59 100644 --- a/onnxruntime/core/providers/xnnpack/nn/resize.h +++ b/onnxruntime/core/providers/xnnpack/tensor/resize.h @@ -31,7 +31,6 @@ class Resize : public UpsampleBase, public XnnpackKernel { const TensorShapeVector& output_dims) const; private: - bool is_NHWC_; XnnpackOperator op0_; TensorShapeVector output_dims_; OpComputeType op_type_ = OpComputeType::op_compute_type_invalid; diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc index 494c718cde081..a2a776df439e4 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_execution_provider.cc @@ -27,88 +27,117 @@ KernelCreateInfo BuildKernelCreateInfo() { return info; } -#define KERNEL_CREATE_INFO_VERSIONED(Start, End, Op) \ - BuildKernelCreateInfo< \ - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, Start, End, Op)> +#define KERNEL_CREATE_INFO_VERSIONED(Start, End, Op, Domain) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, End, Op)> -#define KERNEL_CREATE_INFO(Start, Op) \ - BuildKernelCreateInfo< \ - ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, Start, Op)> +#define KERNEL_CREATE_INFO(Start, Op, Domain) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Op)> -#define KERNEL_CREATE_INFO_TYPED(Start, type, Op) \ - BuildKernelCreateInfo< \ - ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, Start, type, Op)> +#define KERNEL_CREATE_INFO_TYPED(Start, Type, Op, Domain) \ + BuildKernelCreateInfo< \ + ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, Domain, Start, Type, Op)> +// Layout sensitive operators in NHWC domain +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 7, 9, AveragePool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, 
AveragePool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 18, AveragePool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 19, AveragePool); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, 10, Conv); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, Conv); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, ConvTranspose); + class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, 10, ConvTranspose); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, QLinearConvTranspose); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 10, 10, Resize); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 11, 12, Resize); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, 17, Resize); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 18, 18, Resize); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 19, Resize); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, MaxPool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, AveragePool); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 1, 12, Softmax); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, Softmax); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, ConvTranspose); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, uint8_t, QLinearConv); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, int8_t, QLinearConv); + +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, QLinearConvTranspose); + class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 1, QLinearAveragePool); -class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, - kDynamicDomainByCreate, 1, QLinearSoftmax); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 12, Gemm); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 12, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 13, 17, Resize); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 18, 18, Resize); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 19, Resize); + +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 8, 9, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 10, 10, MaxPool); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 11, 11, 
MaxPool); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kMSInternalNHWCDomain, 12, MaxPool); + +// ONNX operators +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 8, Gemm); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 9, 10, Gemm); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 11, 12, Gemm); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, Gemm); -class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 1, 12, MatMul); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 1, 8, MatMul); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 9, 12, MatMul); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, MatMul); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 1, 10, Softmax); +class ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 11, 12, Softmax); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, Softmax); + +// Internal domain +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kDynamicDomainByCreate, 1, QLinearSoftmax); + std::unique_ptr RegisterKernels() { auto kernel_registry = std::make_unique(); static const BuildKernelCreateInfoFn function_table[] = { BuildKernelCreateInfo, // default entry to avoid the list becoming empty after ops-reducing - KERNEL_CREATE_INFO(11, Conv), - KERNEL_CREATE_INFO(11, ConvTranspose), - KERNEL_CREATE_INFO_VERSIONED(1, 10, ConvTranspose), - KERNEL_CREATE_INFO(1, QLinearConvTranspose), - KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool), - KERNEL_CREATE_INFO(12, MaxPool), - KERNEL_CREATE_INFO(11, AveragePool), + // layout sensitive. 
nodes will be moved to kMSInternalNHWCDomain by layout transformation + KERNEL_CREATE_INFO_VERSIONED(7, 9, AveragePool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(10, 10, AveragePool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(11, 18, AveragePool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(19, AveragePool, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO_VERSIONED(1, 10, Conv, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(11, Conv, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO_VERSIONED(1, 10, ConvTranspose, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(11, ConvTranspose, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO_VERSIONED(8, 9, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(10, 10, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(11, 11, MaxPool, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(12, MaxPool, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO(1, QLinearConvTranspose, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO_VERSIONED(10, 10, Resize, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(11, 12, Resize, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(13, 17, Resize, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_VERSIONED(18, 18, Resize, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO(19, Resize, kMSInternalNHWCDomain), + // layout insensitive, use ONNX-domain directly - BuildKernelCreateInfo< - ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, Softmax)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 1, 12, Softmax)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 19, Resize)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 18, 18, Resize)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, 17, Resize)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 11, 12, Resize)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 10, 10, Resize)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 7, 12, Gemm)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, Gemm)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_VERSIONED_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 1, 12, MatMul)>, - BuildKernelCreateInfo< - ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kOnnxDomain, 13, MatMul)>, + KERNEL_CREATE_INFO_VERSIONED(1, 10, Softmax, kOnnxDomain), + KERNEL_CREATE_INFO_VERSIONED(11, 12, Softmax, kOnnxDomain), + KERNEL_CREATE_INFO(13, Softmax, kOnnxDomain), + + KERNEL_CREATE_INFO_VERSIONED(7, 8, Gemm, kOnnxDomain), + KERNEL_CREATE_INFO_VERSIONED(9, 10, Gemm, kOnnxDomain), + KERNEL_CREATE_INFO_VERSIONED(11, 12, Gemm, kOnnxDomain), + KERNEL_CREATE_INFO(13, Gemm, kOnnxDomain), + + KERNEL_CREATE_INFO_VERSIONED(1, 8, MatMul, kOnnxDomain), + KERNEL_CREATE_INFO_VERSIONED(9, 12, MatMul, kOnnxDomain), + KERNEL_CREATE_INFO(13, MatMul, kOnnxDomain), // quantization op - KERNEL_CREATE_INFO_TYPED(10, uint8_t, QLinearConv), - KERNEL_CREATE_INFO_TYPED(10, int8_t, QLinearConv), - KERNEL_CREATE_INFO(1, QLinearAveragePool), - BuildKernelCreateInfo< - ONNX_OPERATOR_KERNEL_CLASS_NAME(kXnnpackExecutionProvider, kDynamicDomainByCreate, 1, 
QLinearSoftmax)>, + KERNEL_CREATE_INFO(1, QLinearAveragePool, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO_TYPED(10, uint8_t, QLinearConv, kMSInternalNHWCDomain), + KERNEL_CREATE_INFO_TYPED(10, int8_t, QLinearConv, kMSInternalNHWCDomain), + + KERNEL_CREATE_INFO(1, QLinearSoftmax, kDynamicDomainByCreate), }; for (auto& function_table_entry : function_table) { diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.cc b/onnxruntime/core/providers/xnnpack/xnnpack_init.cc index 27634a8b7090c..c3aa1d987c194 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.cc +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.cc @@ -26,13 +26,15 @@ void xnn_deallocate(void* context, void* pointer) { } void* xnn_aligned_allocate(void* context, size_t alignment, size_t size) { + if (size == 0) + return nullptr; + #if defined(__wasm__) && !defined(__wasm_relaxed_simd__) && !defined(__wasm_simd128__) ORT_ENFORCE(alignment <= 2 * sizeof(void*)); return xnn_allocate(context, size); #else void* ptr = xnn_allocate(context, size); - ORT_ENFORCE((int64_t(ptr) & (alignment - 1)) == 0, - " xnnpack wants to allocate a space with ", alignment, "bytes aligned. But it's not satisfied"); + ORT_ENFORCE((int64_t(ptr) & (alignment - 1)) == 0, "xnnpack allocation was not aligned to ", alignment, " bytes."); // if ptr is not aligned, we have to find a way to return a aligned ptr and store the original ptr return ptr; #endif diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_init.h b/onnxruntime/core/providers/xnnpack/xnnpack_init.h index d309edd0c3a4e..a1e64bf6046b2 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_init.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_init.h @@ -5,6 +5,47 @@ struct xnn_allocator; namespace onnxruntime { namespace xnnpack { +// copy #define logic from XNNPACK src/xnnpack/common.h to determine workspace alignment +#if defined(__APPLE__) +#include +#endif + +#if defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__) || defined(_M_IX86) +#define XNN_ARCH_X86 1 +#else +#define XNN_ARCH_X86 0 +#endif + +#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) && !defined(_M_ARM64EC) +#define XNN_ARCH_X86_64 1 +#else +#define XNN_ARCH_X86_64 0 +#endif + +#if defined(__wasm__) && !defined(__wasm_relaxed_simd__) && !defined(__wasm_simd128__) +#define XNN_ARCH_WASM 1 +#else +#define XNN_ARCH_WASM 0 +#endif + +#if defined(__ANDROID__) || (defined(__APPLE__) && TARGET_OS_IPHONE) +#define XNN_PLATFORM_MOBILE 1 +#else +#define XNN_PLATFORM_MOBILE 0 +#endif + +#if XNN_ARCH_WASM +#define XNN_ALLOCATION_ALIGNMENT 4 +#elif XNN_ARCH_X86 || XNN_ARCH_X86_64 +#if XNN_PLATFORM_MOBILE +#define XNN_ALLOCATION_ALIGNMENT 32 +#else +#define XNN_ALLOCATION_ALIGNMENT 64 +#endif +#else +#define XNN_ALLOCATION_ALIGNMENT 16 +#endif + std::pair GetStoredAllocator(); } // namespace xnnpack diff --git a/onnxruntime/core/providers/xnnpack/xnnpack_kernel.h b/onnxruntime/core/providers/xnnpack/xnnpack_kernel.h index ada39c767f7c6..0978a88288114 100644 --- a/onnxruntime/core/providers/xnnpack/xnnpack_kernel.h +++ b/onnxruntime/core/providers/xnnpack/xnnpack_kernel.h @@ -4,6 +4,7 @@ #pragma once #include "core/framework/op_kernel.h" #include "core/providers/xnnpack/xnnpack_execution_provider.h" +#include "xnnpack.h" struct pthreadpool; @@ -12,18 +13,59 @@ namespace xnnpack { class XnnpackKernel : public OpKernel { public: - explicit XnnpackKernel(const OpKernelInfo& info) - : OpKernel(info), - xnnpack_threadpool_( - static_cast(info.GetExecutionProvider()) - 
->GetPrivateThreadPool()) { + explicit XnnpackKernel(const OpKernelInfo& info, bool enable_caches = false) + : OpKernel{info}, + xnnpack_threadpool_{ + static_cast(info.GetExecutionProvider())->GetPrivateThreadPool()}, + caches_{enable_caches} { } [[nodiscard]] pthreadpool* GetThreadPool() const { return xnnpack_threadpool_; } + // see comment below about enabling code cache + // xnn_code_cache_t GetCodeCache() { return caches_.auto_code_cache.get();} + xnn_code_cache_t GetCodeCache() { return nullptr; } + xnn_weights_cache_t GetWeightsCache() { return caches_.auto_weights_cache.get(); } + private: pthreadpool* xnnpack_threadpool_; + + // Helper class to wrap usage of the XNNPACK weights and code caches. + // NOTE: Currently creating/freeing the code cache is not exposed via the public xnnpack.h header so usage is + // commented out. If we need to use it, we'll need to add the 'src' directory of XNNPACK to the include path + // and #include "xnnpack/cache.h" + struct Caches { + Caches(bool enable) + : // auto_code_cache(nullptr, xnn_release_code_cache), + auto_weights_cache(nullptr, xnn_delete_weights_cache) { + if (enable) { +#ifdef XNN_CACHE_ENABLE + xnn_status status = xnn_status_success; +#if XNN_PLATFORM_JIT + // status = xnn_init_code_cache(&code_cache_); + // ORT_ENFORCE(status == xnn_status_success, "Failed to initialize XNNPACK code cache");) + // auto_code_cache.reset(&code_cache_); +#endif + // status = xnn_init_weights_cache(&weights_cache_); + xnn_weights_cache_t weights_cache = nullptr; + status = xnn_create_weights_cache(&weights_cache, 0); + ORT_ENFORCE(status == xnn_status_success, "Failed to create XNNPACK weights cache"); + auto_weights_cache.reset(weights_cache); +#endif + } + } + + // std::unique_ptr auto_code_cache; + std::unique_ptr auto_weights_cache; + + // private: + // #if defined(XNN_CACHE_ENABLE) && XNN_PLATFORM_JIT + // xnn_code_cache code_cache_; + // #endif + }; + + Caches caches_; }; } // namespace xnnpack } // namespace onnxruntime diff --git a/onnxruntime/python/onnxruntime_pybind_iobinding.cc b/onnxruntime/python/onnxruntime_pybind_iobinding.cc index 7638a12bb820c..59d5a77bfbea3 100644 --- a/onnxruntime/python/onnxruntime_pybind_iobinding.cc +++ b/onnxruntime/python/onnxruntime_pybind_iobinding.cc @@ -60,8 +60,6 @@ void addIoBindingMethods(pybind11::module& m) { }) // This binds input as a Tensor that wraps memory pointer along with the OrtMemoryInfo .def("bind_input", [](SessionIOBinding* io_binding, const std::string& name, const OrtDevice& device, py::object& element_type, const std::vector& shape, int64_t data_ptr) -> void { - ORT_ENFORCE(data_ptr != 0, "Pointer to data memory is not valid"); - PyArray_Descr* dtype; if (!PyArray_DescrConverter(element_type.ptr(), &dtype)) { throw std::runtime_error("Not a valid numpy type"); diff --git a/onnxruntime/python/onnxruntime_pybind_quant.cc b/onnxruntime/python/onnxruntime_pybind_quant.cc index 04dfa9b51e112..ff76887e917cd 100644 --- a/onnxruntime/python/onnxruntime_pybind_quant.cc +++ b/onnxruntime/python/onnxruntime_pybind_quant.cc @@ -5,7 +5,7 @@ #include #include -#include "contrib_ops/cpu/quantization/dequantize_blockwise.h" +#include "core/mlas/inc/mlas_q4.h" #include "contrib_ops/cpu/quantization/dequantize_blockwise_bnb4.h" #include "core/util/thread_utils.h" @@ -53,15 +53,16 @@ void QuantizeMatMul4BitsBlockwise( py::buffer_info scale_buf = scale.request(); py::buffer_info zp_buf = zero_points.request(); - contrib::QuantizeBlockwise( - static_cast(dst_buf.ptr), - static_cast(src_buf.ptr), - 
static_cast(scale_buf.ptr), - is_symmetric ? nullptr : static_cast(zp_buf.ptr), + MlasQuantizeBlockwise( + reinterpret_cast(dst_buf.ptr), + reinterpret_cast(scale_buf.ptr), + is_symmetric ? nullptr : reinterpret_cast(zp_buf.ptr), + reinterpret_cast(src_buf.ptr), block_size, - 4, - N, + true, K, + N, + N, tp.get()); } diff --git a/onnxruntime/python/tools/kernel_explorer/kernels/matmul_4bits.py b/onnxruntime/python/tools/kernel_explorer/kernels/matmul_4bits.py index 9cb937a13ff27..111e156cd6d01 100644 --- a/onnxruntime/python/tools/kernel_explorer/kernels/matmul_4bits.py +++ b/onnxruntime/python/tools/kernel_explorer/kernels/matmul_4bits.py @@ -56,7 +56,7 @@ def profile_matmul_fp_int4_func(m, n, k, dtype, func, is_symmetric): a = np.random.rand(m, k).astype(dtype) b = np.random.randint(low=0, high=127, size=(n, (k + 31) // 32, 16)).astype("uint8") scales = np.random.rand(n * ((k + 31) // 32)).astype(dtype) - zeropoints = np.random.rand((n * ((k + 31) // 32) + 1) // 2).astype(dtype) + zeropoints = np.random.rand(n * (((k + 31) // 32 + 1) // 2)).astype(dtype) output_d = ke.DeviceArray(output) a_d = ke.DeviceArray(a) diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index fea9e5e8cb739..1c3c212b54fa4 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -61,7 +61,7 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: # block wise quantization, each block comes from a single column packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8") scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype) - zero_point = np.zeros((cols * k_blocks + 1) // 2, dtype="uint8") + zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8") quantize_matmul_4bits(packed, fp32weight, scales, zero_point, block_size, cols, rows, self.is_symmetric) return (packed, scales, zero_point) diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index dc8efbbaf3709..918ee0e6eb976 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -14,7 +14,6 @@ #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" #include "core/util/qmath.h" -#include "contrib_ops/cpu/quantization/dequantize_blockwise.h" #include #include @@ -25,6 +24,8 @@ namespace onnxruntime { namespace test { +static constexpr int QBits = 4; + void QuantizeDequantize(std::vector& raw_vals, std::vector& quant_vals, std::vector& scales, @@ -35,27 +36,29 @@ void QuantizeDequantize(std::vector& raw_vals, OrtThreadPoolParams to; auto tp = concurrency::CreateThreadPool(&onnxruntime::Env::Default(), to, concurrency::ThreadPoolType::INTRA_OP); - contrib::QuantizeBlockwise( + + MlasQuantizeBlockwise( quant_vals.data(), - raw_vals.data(), scales.data(), zp != nullptr ? zp->data() : nullptr, + raw_vals.data(), block_size, - 4, - N, + true, K, + N, + N, tp.get()); // Note that input1_f_vals is NxK after dequant - contrib::DequantizeBlockwise( - raw_vals.data(), - quant_vals.data(), - scales.data(), - zp != nullptr ? zp->data() : nullptr, - block_size, - 4, - N, - K, + MlasDequantizeBlockwise( + raw_vals.data(), // dequantized output + quant_vals.data(), // quantized input + scales.data(), // quantization scales + zp != nullptr ? 
zp->data() : nullptr, // quantization zero points + block_size, // quantization block size + true, // columnwise quantization + K, // number of rows + N, // number of columns tp.get()); } @@ -69,13 +72,21 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, bool has_zerop MlasTranspose(input1_f_vals.data(), input1_f_vals_trans.data(), K, N); #endif - int64_t block_per_k = (K + block_size - 1) / block_size; - int64_t number_of_block = block_per_k * N; - int64_t block_blob_size = block_size * 4 / 8; - int64_t buf_size = number_of_block * (block_size * 4 / 8); - std::vector input1_vals(buf_size); - std::vector scales(number_of_block); - std::vector zp((N * block_per_k + 1) / 2); + int meta_rows; + int meta_cols; + MlasBlockwiseQuantMetaShape((int)block_size, true, (int)K, (int)N, meta_rows, meta_cols); + + int q_rows; + int q_cols; + MlasBlockwiseQuantizedShape((int)block_size, true, (int)K, (int)N, q_rows, q_cols); + + std::vector input1_vals(q_rows * q_cols); + std::vector scales(meta_rows * meta_cols); + + // TODO!! THIS SHOULD BE PROVIDED BY MLAS + // sub 8b packing always happen on the column dimension + const int packed_meta_rows = (meta_rows * QBits + 7) / 8; + std::vector zp(packed_meta_rows * meta_cols); QuantizeDequantize(input1_f_vals, input1_vals, @@ -100,13 +111,13 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, bool has_zerop test.AddAttribute("K", K); test.AddAttribute("N", N); test.AddAttribute("block_size", block_size); - test.AddAttribute("bits", 4); + test.AddAttribute("bits", QBits); if (use_float16) { test.AddInput("A", {M, K}, ToFloat16(input0_vals), false); - test.AddInput("B", {N, block_per_k, block_blob_size}, input1_vals, true); - test.AddInput("scales", {N * block_per_k}, ToFloat16(scales), true); + test.AddInput("B", {q_cols, q_rows}, input1_vals, true); + test.AddInput("scales", {meta_cols * meta_rows}, ToFloat16(scales), true); if (has_zeropoint) { - test.AddInput("zero_points", {(N * block_per_k + 1) / 2}, zp, true); + test.AddInput("zero_points", {meta_cols * packed_meta_rows}, zp, true); } test.AddOutput("Y", {M, N}, ToFloat16(expected_vals)); @@ -117,10 +128,10 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, bool has_zerop test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } else { test.AddInput("A", {M, K}, input0_vals, false); - test.AddInput("B", {N, block_per_k, block_blob_size}, input1_vals, true); - test.AddInput("scales", {N * block_per_k}, scales, true); + test.AddInput("B", {q_cols, q_rows}, input1_vals, true); + test.AddInput("scales", {meta_cols * meta_rows}, scales, true); if (has_zeropoint) { - test.AddInput("zero_points", {(N * block_per_k + 1) / 2}, zp, true); + test.AddInput("zero_points", {meta_cols * packed_meta_rows}, zp, true); } test.AddOutput("Y", {M, N}, expected_vals); diff --git a/onnxruntime/test/mlas/unittest/test_activation.cpp b/onnxruntime/test/mlas/unittest/test_activation.cpp index eb3e35d739bb3..2bb0bbcd35e26 100644 --- a/onnxruntime/test/mlas/unittest/test_activation.cpp +++ b/onnxruntime/test/mlas/unittest/test_activation.cpp @@ -256,9 +256,6 @@ class MlasActivationTest : public MlasTestBase { } }; -template <> -MlasActivationTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { return is_short_execute ? 
MlasDirectShortExecuteTests::RegisterShortExecute() : 0; }); diff --git a/onnxruntime/test/mlas/unittest/test_blkq8.cpp b/onnxruntime/test/mlas/unittest/test_blkq8.cpp index 15bbd1b4cb28d..5cff86d411ca9 100644 --- a/onnxruntime/test/mlas/unittest/test_blkq8.cpp +++ b/onnxruntime/test/mlas/unittest/test_blkq8.cpp @@ -150,12 +150,6 @@ class MlasBlkQ8ShortExeTest : public MlasTestFixture> { size_t M_, K_; }; -template <> -MlasBlkQ8Test* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasBlkQ8Test* MlasTestFixture>::mlas_tester(nullptr); - static size_t BlkQ8ReisterShortTests() { size_t cnt = 0; cnt += MlasBlkQ8ShortExeTest::RegisterShortExecuteTests(); diff --git a/onnxruntime/test/mlas/unittest/test_blockq4.cpp b/onnxruntime/test/mlas/unittest/test_blockq4.cpp index 6f06e0f2eead8..f836da8277bb8 100644 --- a/onnxruntime/test/mlas/unittest/test_blockq4.cpp +++ b/onnxruntime/test/mlas/unittest/test_blockq4.cpp @@ -96,7 +96,8 @@ class MlasBlockwiseQdqTest : public MlasTestBase { } } - MlasDequantizeBlockwise(dequant_buf, elements, scales, zp, block_size, columnwise, rows, columns, threadpool_ptr); + MlasDequantizeBlockwise(dequant_buf, elements, scales, zp, block_size, + columnwise, rows, columns, threadpool_ptr); MlasTranspose(dequant_buf, transposed, columns, rows); @@ -104,7 +105,8 @@ class MlasBlockwiseQdqTest : public MlasTestBase { float* o_scales = OutputScales.GetBuffer(meta_rows * meta_cols); uint8_t* o_zp = symmetric ? nullptr : OutputOffsets.GetBuffer(((meta_rows + 1) / 2) * meta_cols, true); - MlasQuantizeBlockwise(o_elements, o_scales, o_zp, transposed, block_size, columnwise, rows, columns, columns, threadpool_ptr); + MlasQuantizeBlockwise(o_elements, o_scales, o_zp, transposed, block_size, + columnwise, rows, columns, columns, threadpool_ptr); for (int c = 0; c < columns; c++) { for (int r = 0; r < rows; r += 2) { @@ -194,9 +196,6 @@ class MlasBlockwiseQdqTest : public MlasTestBase { MlasBlockwiseQdqTest() = default; }; -template <> -MlasBlockwiseQdqTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { size_t count = 0; if (is_short_execute) { diff --git a/onnxruntime/test/mlas/unittest/test_conv2d.cpp b/onnxruntime/test/mlas/unittest/test_conv2d.cpp index 97560bbfc2e7e..1700cd8f1800f 100644 --- a/onnxruntime/test/mlas/unittest/test_conv2d.cpp +++ b/onnxruntime/test/mlas/unittest/test_conv2d.cpp @@ -4,11 +4,6 @@ #include "test_conv2d.h" #include "test_conv2d_fixture.h" -template <> -MlasConv2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasConv2DTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Conv2dRegistLongExecute() { size_t count = MlasLongExecuteTests>::RegisterLongExecute(); if (GetMlasThreadPool() != nullptr) { diff --git a/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.cpp b/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.cpp index 78a047e385b99..e5a536eb9e4f0 100644 --- a/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.cpp +++ b/onnxruntime/test/mlas/unittest/test_conv2d_nchwc.cpp @@ -4,11 +4,6 @@ #include "test_conv2d_nchwc.h" #include "test_conv2d_fixture.h" -template <> -MlasNchwcConv2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasNchwcConv2DTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Conv2dNchwcRegistLongExecute() { size_t count = 0; diff --git a/onnxruntime/test/mlas/unittest/test_exp.cpp b/onnxruntime/test/mlas/unittest/test_exp.cpp index ce8c4e97748f8..f9cdffef1947d 100644 --- 
a/onnxruntime/test/mlas/unittest/test_exp.cpp +++ b/onnxruntime/test/mlas/unittest/test_exp.cpp @@ -50,9 +50,6 @@ class MlasComputeExpTest : public MlasTestBase { } }; -template <> -MlasComputeExpTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { // no long execute needed return is_short_execute ? MlasDirectShortExecuteTests::RegisterShortExecute() : 0; diff --git a/onnxruntime/test/mlas/unittest/test_fgemm.cpp b/onnxruntime/test/mlas/unittest/test_fgemm.cpp index 6b8d4529faadb..e3f50baf3633d 100644 --- a/onnxruntime/test/mlas/unittest/test_fgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_fgemm.cpp @@ -7,24 +7,6 @@ #include #include -template <> -MlasFgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasFgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasFgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasFgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -#ifdef MLAS_SUPPORTS_GEMM_DOUBLE - -template <> -MlasFgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasFgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -#endif - static size_t FGemmRegistLongExecute() { size_t count = 0; diff --git a/onnxruntime/test/mlas/unittest/test_fp16_activation.cpp b/onnxruntime/test/mlas/unittest/test_fp16_activation.cpp index a9e062e0b6534..484a9a22429d5 100644 --- a/onnxruntime/test/mlas/unittest/test_fp16_activation.cpp +++ b/onnxruntime/test/mlas/unittest/test_fp16_activation.cpp @@ -148,9 +148,6 @@ class MlasFp16ActivationTest : public MlasTestBase { } }; -template <> -MlasFp16ActivationTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { return is_short_execute ? 
MlasDirectShortExecuteTests::RegisterShortExecute() : 0; }); diff --git a/onnxruntime/test/mlas/unittest/test_halfgemm.cpp b/onnxruntime/test/mlas/unittest/test_halfgemm.cpp index 1a307d339b0f2..2a478675d09eb 100644 --- a/onnxruntime/test/mlas/unittest/test_halfgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_halfgemm.cpp @@ -89,42 +89,6 @@ class HalfGemmShortExecuteTest : public MlasTestFixture -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasHalfGemmTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t HalfGemmRegistLongExecute() { size_t count = 0; diff --git a/onnxruntime/test/mlas/unittest/test_minmax.cpp b/onnxruntime/test/mlas/unittest/test_minmax.cpp index f0df504720c0c..245879deccffd 100644 --- a/onnxruntime/test/mlas/unittest/test_minmax.cpp +++ b/onnxruntime/test/mlas/unittest/test_minmax.cpp @@ -46,9 +46,6 @@ class MlasFindMinMaxElementsTest : public MlasTestBase { } }; -template <> -MlasFindMinMaxElementsTest* MlasTestFixture::mlas_tester(nullptr); - #ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wunused-parameter" diff --git a/onnxruntime/test/mlas/unittest/test_pool2d.cpp b/onnxruntime/test/mlas/unittest/test_pool2d.cpp index 012e7f25fddce..8cefb8332ec32 100644 --- a/onnxruntime/test/mlas/unittest/test_pool2d.cpp +++ b/onnxruntime/test/mlas/unittest/test_pool2d.cpp @@ -4,20 +4,6 @@ #include "test_pool2d.h" #include "test_pool2d_fixture.h" -template <> -MlasPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool2DTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool2DTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Pool2dRegistLongExecute() { size_t count = 0; count += MlasLongExecuteTests>::RegisterLongExecute(); diff --git a/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.cpp b/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.cpp index 190fbe7d5a6f1..bee690b10b737 100644 --- a/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.cpp +++ b/onnxruntime/test/mlas/unittest/test_pool2d_nchwc.cpp @@ -4,20 +4,6 @@ #include "test_pool2d_nchwc.h" #include "test_pool2d_fixture.h" -template <> -MlasNchwcPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasNchwcPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasNchwcPool2DTest* 
MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasNchwcPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasNchwcPool2DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasNchwcPool2DTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Pool2dNchwcRegistLongExecute() { size_t count = 0; if (MlasNchwcGetBlockSize() > 1) { diff --git a/onnxruntime/test/mlas/unittest/test_pool3d.cpp b/onnxruntime/test/mlas/unittest/test_pool3d.cpp index a93698234f7da..e0ce4c240be80 100644 --- a/onnxruntime/test/mlas/unittest/test_pool3d.cpp +++ b/onnxruntime/test/mlas/unittest/test_pool3d.cpp @@ -4,20 +4,6 @@ #include "test_pool3d.h" #include "test_pool3d_fixture.h" -template <> -MlasPool3DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool3DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool3DTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasPool3DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool3DTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasPool3DTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Pool3dRegistLongExecute() { size_t count = 0; count += MlasLongExecuteTests>::RegisterLongExecute(); diff --git a/onnxruntime/test/mlas/unittest/test_q4gemm.cpp b/onnxruntime/test/mlas/unittest/test_q4gemm.cpp index 2c3bf23a9330b..dccd7d00b6d3f 100644 --- a/onnxruntime/test/mlas/unittest/test_q4gemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_q4gemm.cpp @@ -83,19 +83,6 @@ class Q4GemmShortExecuteTest : public MlasTestFixture -MlasQ4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ4GemmTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Q4GemmRegistShortExecute() { size_t count = 0; diff --git a/onnxruntime/test/mlas/unittest/test_q4qdq.cpp b/onnxruntime/test/mlas/unittest/test_q4qdq.cpp index 8215c63a2cc56..955c3b1201989 100644 --- a/onnxruntime/test/mlas/unittest/test_q4qdq.cpp +++ b/onnxruntime/test/mlas/unittest/test_q4qdq.cpp @@ -141,9 +141,6 @@ class MlasQ4dqTest : public MlasTestBase { MlasQ4dqTest() = default; }; -template <> -MlasQ4dqTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { if (MlasQ4GemmPackBSize(BlkQ4Sym, 32, 32) == 0) { return (size_t)0; diff --git a/onnxruntime/test/mlas/unittest/test_q8q4gemm.cpp b/onnxruntime/test/mlas/unittest/test_q8q4gemm.cpp index bac16b0103a6e..a78a3261d1f2a 100644 --- a/onnxruntime/test/mlas/unittest/test_q8q4gemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_q8q4gemm.cpp @@ -271,19 +271,6 @@ class Q8Q4GemmShortExecuteTest : public MlasTestFixture -MlasQ8Q4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ8Q4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ8Q4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ8Q4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ8Q4GemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQ8Q4GemmTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t Q8Q4GemmRegistShortExecute() { size_t count = 0; diff --git a/onnxruntime/test/mlas/unittest/test_qgemm.cpp b/onnxruntime/test/mlas/unittest/test_qgemm.cpp index 
a55331f1377fa..6bb93d35357f8 100644 --- a/onnxruntime/test/mlas/unittest/test_qgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_qgemm.cpp @@ -1,60 +1,6 @@ #include "test_qgemm.h" #include "test_qgemm_fixture.h" -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t QGemmRegistLongExecute() { size_t count = 0; diff --git a/onnxruntime/test/mlas/unittest/test_qlinear_binaryop.cpp b/onnxruntime/test/mlas/unittest/test_qlinear_binaryop.cpp index 93dda4bee183b..5876f186eaa0d 100644 --- a/onnxruntime/test/mlas/unittest/test_qlinear_binaryop.cpp +++ b/onnxruntime/test/mlas/unittest/test_qlinear_binaryop.cpp @@ -163,11 +163,6 @@ class MlasQLinearMulTest : public MlasQLinearBinaryOpTest { } }; -template <> -MlasQLinearAddTest* MlasTestFixture::mlas_tester(nullptr); -template <> -MlasQLinearMulTest* MlasTestFixture::mlas_tester(nullptr); - static bool UNUSED_VARIABLE added_to_main = AddTestRegister([](bool is_short_execute) { size_t count = 0; if (is_short_execute) { diff --git a/onnxruntime/test/mlas/unittest/test_qlinear_gavgpool.cpp b/onnxruntime/test/mlas/unittest/test_qlinear_gavgpool.cpp index aeb13af5b941a..e6c230df57fbc 100644 --- a/onnxruntime/test/mlas/unittest/test_qlinear_gavgpool.cpp +++ b/onnxruntime/test/mlas/unittest/test_qlinear_gavgpool.cpp @@ -162,11 +162,6 @@ class MlasQLinearGlobalAveragePoolTest : public MlasTestBase { } }; -template <> -MlasQLinearGlobalAveragePoolTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQLinearGlobalAveragePoolTest* MlasTestFixture>::mlas_tester(nullptr); - template <> const std::vector MlasQLinearGlobalAveragePoolTest::ZeroPoints = {-128, -110, 1, 103, 127}; diff --git a/onnxruntime/test/mlas/unittest/test_quantizelinear.cpp b/onnxruntime/test/mlas/unittest/test_quantizelinear.cpp index 2832598fef1a9..986d158d2b1b9 100644 --- a/onnxruntime/test/mlas/unittest/test_quantizelinear.cpp +++ 
b/onnxruntime/test/mlas/unittest/test_quantizelinear.cpp @@ -71,15 +71,6 @@ class MlasQuantizeLinearTest : public MlasTestBase { } }; -template <> -MlasQuantizeLinearTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQuantizeLinearTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQuantizeLinearTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasQuantizeLinearTest* MlasTestFixture>::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { size_t count = 0; if (is_short_execute) { diff --git a/onnxruntime/test/mlas/unittest/test_reorder_output.cpp b/onnxruntime/test/mlas/unittest/test_reorder_output.cpp index 21373fe9f66e7..e39abd8578da4 100644 --- a/onnxruntime/test/mlas/unittest/test_reorder_output.cpp +++ b/onnxruntime/test/mlas/unittest/test_reorder_output.cpp @@ -88,9 +88,6 @@ class MlasReorderOutputTest : public MlasTestBase { } }; -template <> -MlasReorderOutputTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { return (MlasNchwcGetBlockSize() > 1 && is_short_execute) ? MlasDirectShortExecuteTests::RegisterShortExecute() diff --git a/onnxruntime/test/mlas/unittest/test_scaleoutput.cpp b/onnxruntime/test/mlas/unittest/test_scaleoutput.cpp index 7732b1fa8c72e..34f17843b0726 100644 --- a/onnxruntime/test/mlas/unittest/test_scaleoutput.cpp +++ b/onnxruntime/test/mlas/unittest/test_scaleoutput.cpp @@ -77,9 +77,6 @@ class MlasScaleOutputTest : public MlasTestBase { } }; -template <> -MlasScaleOutputTest* MlasTestFixture::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { return is_short_execute ? MlasDirectShortExecuteTests::RegisterShortExecute() : 0; }); diff --git a/onnxruntime/test/mlas/unittest/test_softmax.cpp b/onnxruntime/test/mlas/unittest/test_softmax.cpp index 3df2b88f9652a..4c5e11bbe9566 100644 --- a/onnxruntime/test/mlas/unittest/test_softmax.cpp +++ b/onnxruntime/test/mlas/unittest/test_softmax.cpp @@ -97,11 +97,6 @@ class MlasSoftmaxTest : public MlasTestBase { } }; -template <> -MlasSoftmaxTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasSoftmaxTest* MlasTestFixture>::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { size_t count = 0; if (is_short_execute) { diff --git a/onnxruntime/test/mlas/unittest/test_symm_qgemm.cpp b/onnxruntime/test/mlas/unittest/test_symm_qgemm.cpp index adfe5564ebbbf..bb3aea02cc011 100644 --- a/onnxruntime/test/mlas/unittest/test_symm_qgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_symm_qgemm.cpp @@ -1,10 +1,5 @@ #include "test_symm_qgemm_fixture.h" -template <> -MlasSymmQgemmTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasSymmQgemmTest* MlasTestFixture>::mlas_tester(nullptr); - static size_t SymmQgemmRegistLongExecute() { if (MlasSymmQgemmPackBSize(16, 16, true) == 0) { return 0; diff --git a/onnxruntime/test/mlas/unittest/test_transpose.cpp b/onnxruntime/test/mlas/unittest/test_transpose.cpp index 74ce5868f411d..8fa98411a21ab 100644 --- a/onnxruntime/test/mlas/unittest/test_transpose.cpp +++ b/onnxruntime/test/mlas/unittest/test_transpose.cpp @@ -45,13 +45,6 @@ class MlasTransposeTest : public MlasTestBase { } }; -template <> -MlasTransposeTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasTransposeTest* MlasTestFixture>::mlas_tester(nullptr); -template <> -MlasTransposeTest* 
MlasTestFixture>::mlas_tester(nullptr); - static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { size_t count = 0; if (is_short_execute) { diff --git a/onnxruntime/test/mlas/unittest/test_util.h b/onnxruntime/test/mlas/unittest/test_util.h index c5ee8b4b6115a..db528ef7291cc 100644 --- a/onnxruntime/test/mlas/unittest/test_util.h +++ b/onnxruntime/test/mlas/unittest/test_util.h @@ -188,8 +188,7 @@ class MlasTestFixture : public testing::Test { mlas_tester = nullptr; }; - // Do not forgot to define this static member element when upon usage. - static TMlasTester* mlas_tester; + static inline TMlasTester* mlas_tester = nullptr; }; // Long Execute test. It is too heavy to register each single test, treat long execute big groups. diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index a6aa4b946f397..e0f63ea58e772 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -32,6 +32,7 @@ #include "core/optimizer/conv_add_fusion.h" #include "core/optimizer/conv_bn_fusion.h" #include "core/optimizer/matmul_bn_fusion.h" +#include "core/optimizer/pad_fusion.h" #include "core/optimizer/conv_mul_fusion.h" #include "core/optimizer/div_mul_fusion.h" #include "core/optimizer/dropout_elimination.h" @@ -1080,6 +1081,163 @@ TEST_F(GraphTransformationTests, FuseConvBNNoBias) { } } +TEST_F(GraphTransformationTests, FusePadWithConv) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-pad-conv.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + std::vector expected_pads; + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "Pad") { + const auto* pads_proto = graph_utils::GetConstantInitializer(graph, node.InputDefs()[1]->Name()); + Initializer pads{*pads_proto, graph.ModelPath()}; + gsl::span pads_values = pads.DataAsSpan(); + expected_pads.resize(pads_values.size() - 4); + + for (uint32_t pads_index = 2, index = 0; pads_index < pads_values.size() / 2; pads_index++, index++) { + expected_pads[index] = pads_values[pads_index]; + expected_pads[index + (expected_pads.size() / 2)] = pads_values[pads_index + (pads_values.size() / 2)]; + } + } else if (node.OpType() == "Conv") { + auto child_pads = node.GetMutableAttributes()["pads"].mutable_ints(); + for (uint32_t index = 0; index < expected_pads.size(); index++) { + expected_pads[index] += child_pads->Get(index); + } + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["Pad"], 0); + ASSERT_EQ(op_to_count["Conv"], 1); + + for (auto& node : graph.Nodes()) { + if (node.OpType() == "Conv") { + auto child_pads = node.GetMutableAttributes()["pads"].mutable_ints(); + ASSERT_EQ(child_pads->size(), static_cast(expected_pads.size())) + << "fusion should produce the same size of pads integer as the Conv node"; + for (uint32_t index = 
0; index < expected_pads.size(); index++) { + ASSERT_EQ(expected_pads[index], child_pads->Get(index)) + << "fusion does not produce correct padding value"; + } + } + } +} + +TEST_F(GraphTransformationTests, FusePadWithMaxPool) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-pad-maxpool.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + std::vector expected_pads; + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "Pad") { + const auto* pads_proto = graph_utils::GetConstantInitializer(graph, node.InputDefs()[1]->Name()); + Initializer pads{*pads_proto, graph.ModelPath()}; + gsl::span pads_values = pads.DataAsSpan(); + expected_pads.resize(pads_values.size() - 4); + + for (uint32_t pads_index = 2, index = 0; pads_index < pads_values.size() / 2; pads_index++, index++) { + expected_pads[index] = pads_values[pads_index]; + expected_pads[index + (expected_pads.size() / 2)] = pads_values[pads_index + (pads_values.size() / 2)]; + } + } else if (node.OpType() == "MaxPool") { + auto child_pads = node.GetMutableAttributes()["pads"].mutable_ints(); + for (uint32_t index = 0; index < expected_pads.size(); index++) { + expected_pads[index] += child_pads->Get(index); + } + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["Pad"], 0); + ASSERT_EQ(op_to_count["MaxPool"], 1); + + for (auto& node : graph.Nodes()) { + if (node.OpType() == "MaxPool") { + auto child_pads = node.GetMutableAttributes()["pads"].mutable_ints(); + ASSERT_EQ(child_pads->size(), static_cast(expected_pads.size())) + << "fusion should produce the same size of pads integer as the MaxPool node"; + for (uint32_t index = 0; index < expected_pads.size(); index++) { + ASSERT_EQ(expected_pads[index], child_pads->Get(index)) + << "fusion does not produce correct padding value"; + } + } + } +} + +TEST_F(GraphTransformationTests, FusePadWithMaxPoolOpsetLessThan11) { + constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-pad-maxpool-opset8.onnx"; + + std::shared_ptr p_model; + ASSERT_STATUS_OK(Model::Load(model_uri, p_model, nullptr, *logger_)); + Graph& graph = p_model->MainGraph(); + + std::vector expected_pads; + GraphViewer graphViewer(graph); + for (auto& node_index : graphViewer.GetNodesInTopologicalOrder()) { + auto& node = *graph.GetNode(node_index); + if (node.OpType() == "Pad") { + gsl::span pads_values = node.GetAttributes().at("pads").ints(); + expected_pads.resize(pads_values.size() - 4); + + for (uint32_t pads_index = 2, index = 0; pads_index < pads_values.size() / 2; pads_index++, index++) { + expected_pads[index] = pads_values[pads_index]; + expected_pads[index + (expected_pads.size() / 2)] = pads_values[pads_index + (pads_values.size() / 2)]; + } + } else if (node.OpType() == "MaxPool") { + auto child_pads = node.GetMutableAttributes()["pads"].mutable_ints(); + for (uint32_t index = 0; index < expected_pads.size(); index++) { + 
expected_pads[index] += child_pads->Get(index); + } + } + } + + onnxruntime::GraphTransformerManager graph_transformation_mgr{5}; + auto rule_transformer_L1 = std::make_unique("RuleTransformerL1"); + ASSERT_STATUS_OK(rule_transformer_L1->Register(std::make_unique())); + ASSERT_STATUS_OK(graph_transformation_mgr.Register(std::move(rule_transformer_L1), TransformerLevel::Level1)); + + ASSERT_STATUS_OK(graph_transformation_mgr.ApplyTransformers(graph, TransformerLevel::Level1, *logger_)); + + std::map op_to_count = CountOpsInGraph(graph); + ASSERT_EQ(op_to_count["Pad"], 0); + ASSERT_EQ(op_to_count["MaxPool"], 1); + + for (auto& node : graph.Nodes()) { + if (node.OpType() == "MaxPool") { + auto child_pads = node.GetMutableAttributes()["pads"].mutable_ints(); + ASSERT_EQ(child_pads->size(), static_cast(expected_pads.size())) + << "fusion should produce the same size of pads integer as the MaxPool node"; + for (uint32_t index = 0; index < expected_pads.size(); index++) { + ASSERT_EQ(expected_pads[index], child_pads->Get(index)) + << "fusion does not produce correct padding value"; + } + } + } +} + TEST_F(GraphTransformationTests, FuseMatmulBNWithInBetweenNodes) { constexpr const ORTCHAR_T* model_uri = MODEL_FOLDER "fusion/fuse-matmul-bn-with-reshape.onnx"; diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index 459a8c71ad611..16cce85f7cb0a 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -399,6 +399,8 @@ bool SetEpsForAllNodes(Graph& graph, const std::vector>& execution_providers, const std::vector>* custom_registries) { const OpSchemaKernelTypeStrResolver kernel_type_str_resolver{}; + const KernelRegistry::TypeConstraintMap type_constraint_map{}; + for (auto& node : graph.Nodes()) { if (node.OpType() == kConstant) continue; @@ -426,13 +428,28 @@ bool SetEpsForAllNodes(Graph& graph, break; } + // check the internal NHWC domain if EP requests NHWC as it may only have a kernel registered in that domain + if (ep->GetPreferredLayout() == DataLayout::NHWC) { + const KernelCreateInfo* kci = nullptr; + auto status = ep->GetKernelRegistry()->TryFindKernel(ep->Type(), + std::string_view(node.OpType()), + std::string_view(kMSInternalNHWCDomain), + node.SinceVersion(), + type_constraint_map, + &kci); + if (status.IsOK() && kci != nullptr) { + found = true; + break; + } + } + // Check the EP has an impl for the node from custom_registries if (custom_registries != nullptr && std::any_of(custom_registries->cbegin(), custom_registries->cend(), - [&](auto reg) { return KernelRegistry::HasImplementationOf( - *reg->GetKernelRegistry(), - node, ep->Type(), - kernel_type_str_resolver); })) { + [&](auto reg) { + return KernelRegistry::HasImplementationOf(*reg->GetKernelRegistry(), node, ep->Type(), + kernel_type_str_resolver); + })) { found = true; break; } @@ -760,7 +777,7 @@ void BaseTester::ExecuteModelForEps( for (const auto& ep : execution_providers) { providers.append(ep->Type() + " "); } - LOGS_DEFAULT(WARNING) << "registered execution providers " << providers << "were unable to run the model."; + LOGS_DEFAULT(WARNING) << "registered execution providers " << providers << " were unable to run the model."; return; } diff --git a/onnxruntime/test/providers/cuda/nhwc/conv_test.cc b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc index be0082f95feb8..13d4546d669e3 100644 --- a/onnxruntime/test/providers/cuda/nhwc/conv_test.cc +++ b/onnxruntime/test/providers/cuda/nhwc/conv_test.cc @@ -21,7 +21,7 @@ struct 
ConvOp { std::unique_ptr get_test() { RandomValueGenerator random{}; - auto test = std::make_unique("Conv", 7); + auto test = std::make_unique("Conv", 11); // internal NHWC domain starts at opset 11 std::vector input_data = random.Uniform(input_dims, 0.0f, 1.0f); std::vector weight_dims{channels, input_dims[1] / group, kernel_shape[0], kernel_shape[1]}; diff --git a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc index f7499fd7ad812..8955a83e66c01 100644 --- a/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc +++ b/onnxruntime/test/providers/internal_testing/internal_testing_tests.cc @@ -203,7 +203,8 @@ TEST(InternalTestingEP, TestMixOfStaticAndCompiledKernels) { } TEST(InternalTestingEP, TestNhwcConversionOfStaticKernels) { - const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "squeezenet/model.onnx"; + // the internal NHWC domain supports opset 11 and later + const ORTCHAR_T* ort_model_path = ORT_MODEL_FOLDER "squeezenet/model_opset11.onnx"; SessionOptions so; // set this if you want to manually inspect the optimized model diff --git a/onnxruntime/test/providers/qnn/pad_op_test.cpp b/onnxruntime/test/providers/qnn/pad_op_test.cpp index e92f0ae770a88..792dbeadfa758 100644 --- a/onnxruntime/test/providers/qnn/pad_op_test.cpp +++ b/onnxruntime/test/providers/qnn/pad_op_test.cpp @@ -167,7 +167,7 @@ TEST_F(QnnCPUBackendTests, Pad2dPadsNotIni) { TEST_F(QnnCPUBackendTests, DISABLED_PadModeReflect) { bool has_constant_value = false; RunPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), - TestInputDef({4}, true, {0, 2, 0, 0}), + TestInputDef({4}, true, {0, 1, 0, 0}), TestInputDef({1}, true, {0.0f}), {utils::MakeAttribute("mode", "reflect")}, ExpectedEPNodeAssignment::All, @@ -266,13 +266,37 @@ TEST_F(QnnHTPBackendTests, PadHasConstantValueQuantized) { constant_value_quantized); } -// QNN graph execute error. 
Error code: 6031 -TEST_F(QnnHTPBackendTests, DISABLED_PadReflectMode) { +TEST_F(QnnHTPBackendTests, PadReflectMode) { + bool has_constant_value_input = false; + RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), + TestInputDef({4}, true, {0, 1, 0, 0}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "reflect")}, + ExpectedEPNodeAssignment::All, + has_constant_value_input); +} + +// Pad amount should not be greater than shape(input[0])[i] - 1 +TEST_F(QnnHTPBackendTests, PadReflectModeOutOfRangePadAmount) { bool has_constant_value_input = false; RunQDQPadOpTest(TestInputDef({3, 2}, false, {1.0f, 1.2f, 2.3f, 3.4f, 4.5f, 5.6f}), TestInputDef({4}, true, {0, 2, 0, 0}), TestInputDef({1}, true, {0.0f}), {utils::MakeAttribute("mode", "reflect")}, + ExpectedEPNodeAssignment::None, + has_constant_value_input); +} + +TEST_F(QnnHTPBackendTests, Pad4dReflectMode) { + bool has_constant_value_input = false; + RunQDQPadOpTest(TestInputDef({1, 2, 2, 2}, false, + {1.0f, 2.0f, + 3.0f, 4.0f, + 5.0f, 6.0f, + 7.0f, 8.0f}), + TestInputDef({8}, true, {0, 1, 1, 1, 0, 1, 1, 1}), + TestInputDef({1}, true, {0.0f}), + {utils::MakeAttribute("mode", "reflect")}, ExpectedEPNodeAssignment::All, has_constant_value_input); } diff --git a/onnxruntime/test/python/quantization/test_op_gemm.py b/onnxruntime/test/python/quantization/test_op_gemm.py index 54ef1cc1d5446..bac0f6d48e9fc 100644 --- a/onnxruntime/test/python/quantization/test_op_gemm.py +++ b/onnxruntime/test/python/quantization/test_op_gemm.py @@ -192,24 +192,9 @@ def static_quant_test( check_qtype_by_node_type(self, model_int8_path, qnode_io_qtypes) data_reader.rewind() if activation_type_str == "f8e4m3fn" and weight_type_str == "f8e4m3fn": - # QGemm is not implemented for CPU. - try: - check_model_correctness( - self, - model_fp32_path, - model_int8_path, - data_reader.get_next(), - providers=["CUDAExecutionProvider", "CPUExecutionProvider"], - is_gemm=True, - ) - except Exception as e: - if ( - "Type 'tensor(float8e4m3fn)' of input parameter (input_quantized) of operator (QGemm) in node () is invalid." - in str(e) - ): - warnings.warn("Fix this test when QGemm is implemented.") - return - raise e + # QGemm for float 8 is not implemented. The test should be updated when it is. 
+ warnings.warn("Fix this test when QGemm is implemented for float 8 types.") + return else: check_model_correctness(self, model_fp32_path, model_int8_path, data_reader.get_next(), is_gemm=True) diff --git a/onnxruntime/test/python/quantization/test_quantizeblockwise_4bits.py b/onnxruntime/test/python/quantization/test_quantizeblockwise_4bits.py index e03a0167d070a..765825d4b86e3 100644 --- a/onnxruntime/test/python/quantization/test_quantizeblockwise_4bits.py +++ b/onnxruntime/test/python/quantization/test_quantizeblockwise_4bits.py @@ -38,8 +38,8 @@ def quantize_blockwise_4bits_ref(matrix_float: npt.ArrayLike, block_size: int, i matrix_float_padded = np.pad(matrix_float, ((0, pad_len), (0, 0)), "constant") packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8") - scales = np.zeros((cols * k_blocks), dtype=matrix_float_padded.dtype) - zero_point = np.full((cols * k_blocks + 1) // 2, 136, dtype="uint8") + scales = np.zeros((cols, k_blocks), dtype=matrix_float_padded.dtype) + zero_point = np.full((cols, (k_blocks + 1) // 2), 136, dtype="uint8") matrix_float_padded = np.transpose(matrix_float_padded) for n in range(cols): @@ -61,10 +61,12 @@ def quantize_blockwise_4bits_ref(matrix_float: npt.ArrayLike, block_size: int, i zp = min(15, max(0, round(zero_point_fp))) reciprocal_scale = 1.0 / scale if scale != 0 else 0.0 - block_idx = n * k_blocks + k_id // block_size - scales[block_idx] = scale - zp_pair = zero_point[block_idx // 2] - zero_point[block_idx // 2] = ((zp_pair & 0x0F) | (zp << 4)) if (block_idx & 1) else ((zp_pair & 0xF0) | zp) + block_idx = k_id // block_size + scales[n, block_idx] = scale + zp_pair = zero_point[n, block_idx // 2] + zero_point[n, block_idx // 2] = ( + ((zp_pair & 0x0F) | (zp << 4)) if (block_idx & 1) else ((zp_pair & 0xF0) | zp) + ) blk_int0 = np.clip( np.round(np.float32(matrix_float_padded[n, k_id : k_id + block_size : 2] * reciprocal_scale + zp)), @@ -76,7 +78,7 @@ def quantize_blockwise_4bits_ref(matrix_float: npt.ArrayLike, block_size: int, i 0, 15, ).astype("uint8") - packed[n, k_id // block_size] = np.bitwise_or(blk_int0, np.left_shift(blk_int1, 4)) + packed[n, block_idx] = np.bitwise_or(blk_int0, np.left_shift(blk_int1, 4)) return (packed, scales, zero_point) @@ -88,8 +90,8 @@ def quantize_blockwise_4bits_target(matrix_float: npt.ArrayLike, block_size: int k_blocks = (rows + block_size - 1) // block_size packed = np.zeros((cols, k_blocks, block_size // 2), dtype="uint8") - scales = np.zeros((cols * k_blocks), dtype=matrix_float.dtype) - zero_point = np.full((cols * k_blocks + 1) // 2, 136, dtype="uint8") + scales = np.zeros((cols, k_blocks), dtype=matrix_float.dtype) + zero_point = np.full((cols, (k_blocks + 1) // 2), 136, dtype="uint8") from onnxruntime.capi._pybind_state import quantize_matmul_4bits quantize_matmul_4bits(packed, matrix_float, scales, zero_point, block_size, cols, rows, is_symmetric) @@ -116,24 +118,22 @@ def test_quantize_blockwise_4bits(self): assert np.allclose(zero_point_ref, zero_point) for c in range(quant_value_ref.shape[0]): for k in range(quant_value_ref.shape[1]): - block_idx = c * quant_value_ref.shape[1] + k - zp_idx = block_idx // 2 assert np.allclose( dequantize_blockwise_4bits( - quant_value_ref[c][k], - scales_ref[block_idx], - (zero_point_ref[zp_idx] >> 4) - if (block_idx & 1) - else (zero_point_ref[zp_idx] & 0x0F), + quant_value_ref[c, k], + scales_ref[c, k], + (zero_point_ref[c, k // 2] >> 4) + if (k & 1) + else (zero_point_ref[c, k // 2] & 0x0F), min(block_size, rows - k * block_size), ), 
dequantize_blockwise_4bits( - quant_value[c][k], - scales[block_idx], - (zero_point[zp_idx] >> 4) if (block_idx & 1) else (zero_point[zp_idx] & 0x0F), + quant_value[c, k], + scales[c, k], + (zero_point[c, k // 2] >> 4) if (k & 1) else (zero_point[c, k // 2] & 0x0F), min(block_size, rows - k * block_size), ), - atol=1.2 * abs(scales[block_idx]), + atol=1.2 * abs(scales[c, k]), ) diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 33d50f90333cf..7dee0bc41a6f3 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -2832,6 +2832,58 @@ TEST(CApiTest, ConfigureCudaArenaAndDemonstrateMemoryArenaShrinkage) { #endif #ifdef USE_TENSORRT +TEST(TensorrtExecutionProviderTest, ShapeTensorTest) { + const auto& api = Ort::GetApi(); + + // Test input tensor which is shape tensor with explicit trt profile shapes + Ort::SessionOptions session_options; + OrtTensorRTProviderOptionsV2* trt_options; + ASSERT_TRUE(api.CreateTensorRTProviderOptions(&trt_options) == nullptr); + std::unique_ptr + rel_trt_options(trt_options, api.ReleaseTensorRTProviderOptions); + + const char* trt_profile_min_shapes = "data:2x2,shape:4x1"; + const char* trt_profile_max_shapes = "data:2x2,shape:4x1"; + const char* trt_profile_opt_shapes = "data:2x2,shape:4x1"; + std::vector keys{"trt_profile_min_shapes", "trt_profile_max_shapes", "trt_profile_opt_shapes"}; + std::vector values{trt_profile_min_shapes, trt_profile_max_shapes, trt_profile_opt_shapes}; + ASSERT_TRUE(api.UpdateTensorRTProviderOptions(rel_trt_options.get(), keys.data(), values.data(), keys.size()) == nullptr); + ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_TensorRT_V2( + static_cast(session_options), + rel_trt_options.get()) == nullptr); + + auto model_path = ORT_TSTR("testdata/trt_reshape.onnx"); + + std::vector input_value_0{1.1f, 1.2f, 1.3f, 1.4f}; + std::vector input_shape_0{2, 2}; + std::vector input_value_1{4, 1}; + std::vector input_shape_1{2}; + + std::vector input_names{"data", "shape"}; + Ort::MemoryInfo info("Cpu", OrtDeviceAllocator, 0, OrtMemTypeDefault); + + std::vector ort_inputs; + ort_inputs.emplace_back(Ort::Value::CreateTensor(info, input_value_0.data(), input_value_0.size(), input_shape_0.data(), input_shape_0.size())); + ort_inputs.emplace_back(Ort::Value::CreateTensor(info, input_value_1.data(), input_value_1.size(), input_shape_1.data(), input_shape_1.size())); + + const char* output_names[] = {"reshaped"}; + + Ort::Session session(*ort_env, model_path, session_options); + session.Run(Ort::RunOptions{}, input_names.data(), ort_inputs.data(), ort_inputs.size(), output_names, countof(output_names)); + + // Test input tensor which is shape tensor with implicit trt profile shapes + Ort::SessionOptions session_options_2; + OrtTensorRTProviderOptionsV2* trt_options_2; + ASSERT_TRUE(api.CreateTensorRTProviderOptions(&trt_options_2) == nullptr); + std::unique_ptr + rel_trt_options_2(trt_options_2, api.ReleaseTensorRTProviderOptions); + ASSERT_TRUE(api.SessionOptionsAppendExecutionProvider_TensorRT_V2( + static_cast(session_options_2), + rel_trt_options_2.get()) == nullptr); + Ort::Session session_2(*ort_env, model_path, session_options_2); + session_2.Run(Ort::RunOptions{}, input_names.data(), ort_inputs.data(), ort_inputs.size(), output_names, countof(output_names)); +} + TEST(CApiTest, TestExternalCUDAStreamWithIOBinding) { const auto& api = Ort::GetApi(); Ort::SessionOptions session_options; diff --git 
a/onnxruntime/test/testdata/squeezenet/model_opset11.onnx b/onnxruntime/test/testdata/squeezenet/model_opset11.onnx new file mode 100644 index 0000000000000..dcf322a58c042 Binary files /dev/null and b/onnxruntime/test/testdata/squeezenet/model_opset11.onnx differ diff --git a/onnxruntime/test/testdata/transform/fusion/fuse-pad-conv.onnx b/onnxruntime/test/testdata/transform/fusion/fuse-pad-conv.onnx new file mode 100644 index 0000000000000..ced1950005985 Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/fuse-pad-conv.onnx differ diff --git a/onnxruntime/test/testdata/transform/fusion/fuse-pad-maxpool-opset8.onnx b/onnxruntime/test/testdata/transform/fusion/fuse-pad-maxpool-opset8.onnx new file mode 100644 index 0000000000000..feb1f024ceed7 Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/fuse-pad-maxpool-opset8.onnx differ diff --git a/onnxruntime/test/testdata/transform/fusion/fuse-pad-maxpool.onnx b/onnxruntime/test/testdata/transform/fusion/fuse-pad-maxpool.onnx new file mode 100644 index 0000000000000..32e959262f6b5 Binary files /dev/null and b/onnxruntime/test/testdata/transform/fusion/fuse-pad-maxpool.onnx differ diff --git a/onnxruntime/test/testdata/trt_reshape.onnx b/onnxruntime/test/testdata/trt_reshape.onnx new file mode 100644 index 0000000000000..7d195af2ae204 --- /dev/null +++ b/onnxruntime/test/testdata/trt_reshape.onnx @@ -0,0 +1,16 @@ + :‰ +) +data +shapereshapedReshape"Reshapetrt_engine_wrapperZ +data +  +N +Z +shape + + +b +reshaped +  + +B \ No newline at end of file diff --git a/onnxruntime/test/testdata/trt_reshape_test.py b/onnxruntime/test/testdata/trt_reshape_test.py new file mode 100644 index 0000000000000..42777bd3d50c7 --- /dev/null +++ b/onnxruntime/test/testdata/trt_reshape_test.py @@ -0,0 +1,42 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
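For context, a minimal sketch of how the trt_reshape.onnx model produced by the generator script below can be driven at run time. This is not part of the patch: it assumes onnxruntime is installed, that trt_reshape.onnx is in the working directory, and it uses the CPU execution provider for simplicity (the C++ ShapeTensorTest above exercises the same model through the TensorRT EP with explicit profile shapes such as "data:2x2,shape:4x1").

    import numpy as np
    import onnxruntime as ort

    # Load the generated Reshape model; "shape" is a shape tensor input.
    sess = ort.InferenceSession("trt_reshape.onnx", providers=["CPUExecutionProvider"])

    data = np.array([[1.1, 1.2], [1.3, 1.4]], dtype=np.float32)  # input shape (2, 2)
    shape = np.array([4, 1], dtype=np.int64)                     # requested output shape

    (reshaped,) = sess.run(["reshaped"], {"data": data, "shape": shape})
    print(reshaped.shape)  # (4, 1)
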
+ +import onnx +from onnx import TensorProto, helper + + +def generate_model(model_name): + nodes = [ + helper.make_node( + "Reshape", + ["data", "shape"], + ["reshaped"], + "Reshape", + ), + ] + + graph = helper.make_graph( + nodes, + "trt_engine_wrapper", + [ # input + helper.make_tensor_value_info("data", TensorProto.FLOAT, ["N", 2]), + helper.make_tensor_value_info( + "shape", + TensorProto.INT64, + [ + 2, + ], + ), + ], + [ # output + helper.make_tensor_value_info("reshaped", TensorProto.FLOAT, [4, 1]), + ], + ) + + model = helper.make_model(graph) + onnx.save(model, model_name) + + +if __name__ == "__main__": + generate_model("trt_reshape.onnx") diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index a992da8ff993e..25d69ef3a1eb5 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -796,6 +796,7 @@ def run_subprocess( my_env.update(env) + log.info(" ".join(args)) return run(*args, cwd=cwd, capture_stdout=capture_stdout, shell=shell, env=my_env) @@ -2024,13 +2025,6 @@ def build_python_wheel( run_subprocess(args, cwd=cwd) -def derive_linux_build_property(): - if is_windows(): - return '/p:IsLinuxBuild="false"' - else: - return '/p:IsLinuxBuild="true"' - - def build_nuget_package( cmake_path, source_dir, @@ -2043,7 +2037,6 @@ def build_nuget_package( use_dnnl, use_tvm, use_winml, - use_snpe, use_qnn, enable_training_apis, msbuild_extra_options, @@ -2054,83 +2047,93 @@ def build_nuget_package( ) csharp_build_dir = os.path.join(source_dir, "csharp") - is_linux_build = derive_linux_build_property() # in most cases we don't want/need to include the Xamarin mobile targets, as doing so means the Xamarin # mobile workloads must be installed on the machine. # they are only included in the Microsoft.ML.OnnxRuntime nuget package sln = "OnnxRuntime.DesktopOnly.CSharp.sln" + have_exclude_mobile_targets_option = "IncludeMobileTargets=false" in msbuild_extra_options # derive package name and execution provider based on the build args target_name = "/t:CreatePackage" - execution_provider = '/p:ExecutionProvider="None"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime"' - enable_training_tests = '/p:TrainingEnabledNativeBuild="false"' + execution_provider = "/p:ExecutionProvider=None" + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime" + enable_training_tests = "/p:TrainingEnabledNativeBuild=false" + if enable_training_apis: - enable_training_tests = '/p:TrainingEnabledNativeBuild="true"' + enable_training_tests = "/p:TrainingEnabledNativeBuild=true" if use_cuda: - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.Training.Gpu"' + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Training.Gpu" else: - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.Training"' + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Training" elif use_winml: - package_name = '/p:OrtPackageId="Microsoft.AI.MachineLearning"' + package_name = "/p:OrtPackageId=Microsoft.AI.MachineLearning" target_name = "/t:CreateWindowsAIPackage" elif use_openvino: - execution_provider = '/p:ExecutionProvider="openvino"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.OpenVino"' + execution_provider = "/p:ExecutionProvider=openvino" + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.OpenVino" elif use_tensorrt: - execution_provider = '/p:ExecutionProvider="tensorrt"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.TensorRT"' + execution_provider = "/p:ExecutionProvider=tensorrt" + package_name = 
"/p:OrtPackageId=Microsoft.ML.OnnxRuntime.TensorRT" elif use_dnnl: - execution_provider = '/p:ExecutionProvider="dnnl"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.DNNL"' + execution_provider = "/p:ExecutionProvider=dnnl" + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.DNNL" elif use_cuda: - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu"' + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu" elif use_rocm: - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.ROCm"' + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm" elif use_tvm: - execution_provider = '/p:ExecutionProvider="tvm"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.Tvm"' - elif use_snpe: - execution_provider = '/p:ExecutionProvider="snpe"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.Snpe"' + execution_provider = "/p:ExecutionProvider=tvm" + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Tvm" elif use_qnn: - execution_provider = '/p:ExecutionProvider="qnn"' - package_name = '/p:OrtPackageId="Microsoft.ML.OnnxRuntime.QNN"' + execution_provider = "/p:ExecutionProvider=qnn" + package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.QNN" elif any(map(lambda x: "OrtPackageId=" in x, msbuild_extra_options)): pass else: - # use the solution file that includes Xamarin mobile targets - sln = "OnnxRuntime.CSharp.sln" + # we currently only allow building with mobile targets on Windows. + # it should be possible to allow building with android targets on Linux but that requires updating the + # csproj to separate the inclusion of ios and android targets. + if is_windows() and have_exclude_mobile_targets_option is False: + # use the sln that include the mobile targets + sln = "OnnxRuntime.CSharp.sln" + + # explicitly exclude mobile targets in this case + if sln != "OnnxRuntime.CSharp.sln" and have_exclude_mobile_targets_option is False: + msbuild_extra_options.append("IncludeMobileTargets=false") + + # expand extra_options to add prefix + extra_options = ["/p:" + option for option in msbuild_extra_options] + + # we have to use msbuild directly if including Xamarin targets as dotnet only supports MAUI (.net6) + use_dotnet = sln != "OnnxRuntime.CSharp.sln" + + if use_dotnet: + cmd_args = ["dotnet", "restore", sln, "--configfile", "NuGet.CSharp.config", *extra_options] + else: + cmd_args = ["msbuild", sln, "/t:restore", "/p:RestoreConfigFile=NuGet.CSharp.config", *extra_options] # set build directory based on build_dir arg native_dir = os.path.normpath(os.path.join(source_dir, build_dir)) - ort_build_dir = '/p:OnnxRuntimeBuildDirectory="' + native_dir + '"' + ort_build_dir = "/p:OnnxRuntimeBuildDirectory=" + native_dir - # dotnet restore - cmd_args = ["dotnet", "restore", sln, "--configfile", "NuGet.CSharp.config"] run_subprocess(cmd_args, cwd=csharp_build_dir) # build csharp bindings and create nuget package for each config for config in configs: - if is_linux(): - native_build_dir = os.path.join(native_dir, config) - cmd_args = [cmake_path, "-DCMAKE_INSTALL_PREFIX=./nuget-staging/usr/local", "-Pcmake_install.cmake"] - run_subprocess(cmd_args, cwd=native_build_dir) - - configuration = '/p:Configuration="' + config + '"' - + configuration = "/p:Configuration=" + config if not use_winml: - cmd_args = [ - "dotnet", + cmd_args = ["dotnet"] if use_dotnet else [] + cmd_args += [ "msbuild", sln, configuration, package_name, - is_linux_build, ort_build_dir, enable_training_tests, + *extra_options, ] + run_subprocess(cmd_args, 
cwd=csharp_build_dir) else: winml_interop_dir = os.path.join(source_dir, "csharp", "src", "Microsoft.AI.MachineLearning.Interop") @@ -2141,7 +2144,7 @@ def build_nuget_package( "msbuild", winml_interop_project, configuration, - '/p:Platform="Any CPU"', + "/p:Platform=Any CPU", ort_build_dir, "-restore", ] @@ -2155,26 +2158,28 @@ def build_nuget_package( # this path is setup by cmake/nuget_helpers.cmake for MSVC on Windows nuget_exe = os.path.normpath(os.path.join(native_dir, config, "nuget_exe", "src", "nuget.exe")) else: - # user needs to make sure nuget is installed and can be found - nuget_exe = "nuget" + # `dotnet pack` is used on Linux + nuget_exe = "NugetExe_not_set" nuget_exe_arg = '/p:NugetExe="' + nuget_exe + '"' - cmd_args = [ - "dotnet", + cmd_args = ["dotnet"] if use_dotnet else [] + cmd_args += [ "msbuild", "OnnxRuntime.CSharp.proj", target_name, package_name, configuration, execution_provider, - is_linux_build, ort_build_dir, nuget_exe_arg, + *extra_options, ] - cmd_args.extend(msbuild_extra_options) + run_subprocess(cmd_args, cwd=csharp_build_dir) + log.info(f"nuget package was created in the {config} build output directory.") + def run_csharp_tests(source_dir, build_dir, use_cuda, use_openvino, use_tensorrt, use_dnnl, enable_training_apis): # Currently only running tests on windows. @@ -2637,6 +2642,7 @@ def main(): enable_training_apis=args.enable_training_apis, enable_rocm_profiling=args.enable_rocm_profiling, ) + if args.build_nuget: build_nuget_package( cmake_path, @@ -2650,7 +2656,6 @@ def main(): args.use_dnnl, args.use_tvm, args.use_winml, - args.use_snpe, args.use_qnn, args.enable_training_apis, normalize_arg_list(args.msbuild_extra_options), diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 14a9bbedf09a0..ac07d8c525372 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -716,44 +716,29 @@ stages: versionSpec: 6.2.1 - task: PowerShell@2 - displayName: Install .NET 6 workloads + displayName: Install mobile workloads inputs: targetType: 'inline' script: | - dotnet workload install android ios macos - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: PowerShell@2 - displayName: Build .NET 6 targets using dotnet - inputs: - targetType: 'inline' - # we don't specify 'Any CPU' as the platform here because if we do it gets added to the output path - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\Any CPU\RelWithDebInfo\net6.0-ios\ - # which is inconsistent with the msbuild output path for the pre-.net6 targets - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\monoandroid11.0 - # and makes it harder to do the packing - # - # 'Any CPU' is the default (first 'mixed' platform specified in the csproj) so this should be fine. 
- script: | - dotnet build .\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj -p:SelectedTargets=Net6 -p:Configuration=RelWithDebInfo -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu" -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) + dotnet workload install android ios workingDirectory: '$(Build.SourcesDirectory)\csharp' - task: MSBuild@1 - displayName: 'Restore NuGet Packages and create project.assets.json for pre-.net6 targets' + displayName: 'Restore NuGet Packages and create project.assets.json' inputs: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=PreNet6 -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu"' + msbuildArguments: '-t:restore -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu"' workingDirectory: '$(Build.SourcesDirectory)\csharp' - task: MSBuild@1 - displayName: 'Build C# for pre-.net6 targets' + displayName: 'Build C# bindings' inputs: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' configuration: RelWithDebInfo platform: 'Any CPU' - msbuildArguments: '-p:SelectedTargets=PreNet6 -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu" -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' + msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu" -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' workingDirectory: '$(Build.SourcesDirectory)\csharp' - template: templates/win-esrp-dll.yml @@ -762,15 +747,6 @@ stages: DisplayName: 'ESRP - Sign C# dlls' DoEsrp: ${{ parameters.DoEsrp }} - - task: MSBuild@1 - displayName: Update projects.assets.json with combined list of all target frameworks - inputs: - solution: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj' - platform: 'Any CPU' - configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=All -p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - task: MSBuild@1 displayName: 'Build Nuget Packages' inputs: diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml index 81e8d67b79021..2d92108efb46d 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml @@ -137,7 +137,7 @@ stages: - task: MSBuild@1 displayName: 'Restore NuGet Packages' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: '$(BuildConfig)' msbuildArguments: '-t:restore -p:OrtPackageId=${{ parameters.OrtPackageId }}' @@ -146,7 +146,7 @@ stages: - task: MSBuild@1 displayName: 'Build C#' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' configuration: '$(BuildConfig)' platform: 'Any CPU' msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=${{ parameters.OrtPackageId }} -p:IsReleaseBuild=${{ 
parameters.IsReleaseBuild }}' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index af245c99700ec..4ce39ecc35bfb 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -398,44 +398,29 @@ stages: versionSpec: 6.2.1 - task: PowerShell@2 - displayName: Install .NET 6 workloads + displayName: Install mobile workloads inputs: targetType: 'inline' script: | - dotnet workload install android ios macos - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: PowerShell@2 - displayName: Build Microsoft.ML.OnnxRuntime .NET 6 targets using dotnet - inputs: - targetType: 'inline' - # we don't specify 'Any CPU' as the platform here because if we do it gets added to the output path - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\Any CPU\RelWithDebInfo\net6.0-ios\ - # which is inconsistent with the msbuild output path for the pre-.net6 targets - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\monoandroid11.0 - # and makes it harder to do the packing - # - # 'Any CPU' is the default (first 'mixed' platform specified in the csproj) so this should be fine. - script: | - dotnet build .\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj -p:SelectedTargets=Net6 -p:Configuration=RelWithDebInfo -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) + dotnet workload install android ios workingDirectory: '$(Build.SourcesDirectory)\csharp' - task: MSBuild@1 - displayName: 'Restore NuGet Packages and create project.assets.json for pre-.net6 targets' + displayName: 'Restore NuGet Packages and create project.assets.json' inputs: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=PreNet6 -p:OrtPackageId=$(OrtPackageId)' + msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)' workingDirectory: '$(Build.SourcesDirectory)\csharp' - task: MSBuild@1 - displayName: 'Build C# for pre-.net6 targets' + displayName: 'Build C# bindings' inputs: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo - msbuildArguments: '-p:SelectedTargets=PreNet6 -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' + msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' workingDirectory: '$(Build.SourcesDirectory)\csharp' - ${{ if eq(parameters.DoEsrp, true) }}: @@ -445,15 +430,6 @@ stages: DisplayName: 'ESRP - Sign C# dlls' DoEsrp: ${{ parameters.DoEsrp }} - - task: MSBuild@1 - displayName: Update projects.assets.json with combined list of all target frameworks - inputs: - solution: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj' - platform: 'Any CPU' - configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=All -p:OrtPackageId=$(OrtPackageId)' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - task: MSBuild@1 displayName: 'Build Nuget Packages' 
inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 0f6310724e9a1..dc41a2d398893 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.104 + version: 1.0.107 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.104 + version: 1.0.107 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml index 3b1fde6cb6e4f..404699f705344 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml @@ -169,7 +169,7 @@ jobs: - task: MSBuild@1 displayName: 'Restore NuGet Packages' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' platform: 'Any CPU' configuration: '${{ parameters.BuildConfig }}' msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)' @@ -178,7 +178,7 @@ jobs: - task: MSBuild@1 displayName: 'Build C#' inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.DesktopOnly.CSharp.sln' configuration: '${{ parameters.BuildConfig }}' platform: 'Any CPU' msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId)' @@ -197,7 +197,7 @@ jobs: command: test projects: '$(Build.SourcesDirectory)\csharp\test\Microsoft.ML.OnnxRuntime.Tests.NetCoreApp\Microsoft.ML.OnnxRuntime.Tests.NetCoreApp.csproj' configuration: '${{ parameters.BuildConfig }}' - arguments: '--configuration ${{ parameters.BuildConfig }} -p:Platform="Any CPU" -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) --blame' + arguments: '--configuration ${{ parameters.BuildConfig }} -p:Platform="Any CPU" -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IncludeMobileTargets=false --blame' workingDirectory: '$(Build.SourcesDirectory)\csharp' - ${{ if eq(parameters.EnablePython, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml index 792e828c9a880..24e46066a1f10 100644 --- a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml @@ -222,44 +222,29 @@ stages: versionSpec: 6.2.1 - task: PowerShell@2 - displayName: Install .NET 6 workloads + displayName: Install mobile workloads inputs: targetType: 'inline' script: | dotnet workload install android workingDirectory: '$(Build.SourcesDirectory)\csharp' - - task: PowerShell@2 - displayName: Build Microsoft.ML.OnnxRuntime .NET 6 targets using dotnet - inputs: - 
targetType: 'inline' - # we don't specify 'Any CPU' as the platform here because if we do it gets added to the output path - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\Any CPU\RelWithDebInfo\net6.0-ios\ - # which is inconsistent with the msbuild output path for the pre-.net6 targets - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\monoandroid11.0 - # and makes it harder to do the packing - # - # 'Any CPU' is the default (first 'mixed' platform specified in the csproj) so this should be fine. - script: | - dotnet build .\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj -p:SelectedTargets=Net6 -p:Configuration=RelWithDebInfo -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - task: MSBuild@1 - displayName: 'Restore NuGet Packages and create project.assets.json for pre-.net6 targets' + displayName: 'Restore NuGet Packages and create project.assets.json' inputs: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=PreNet6 -p:OrtPackageId=$(OrtPackageId)' + msbuildArguments: '-t:restore -p:OrtPackageId=$(OrtPackageId)' workingDirectory: '$(Build.SourcesDirectory)\csharp' - task: MSBuild@1 - displayName: 'Build C# for pre-.net6 targets' + displayName: 'Build C# bindings' inputs: solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' platform: 'Any CPU' configuration: RelWithDebInfo - msbuildArguments: '-p:SelectedTargets=PreNet6 -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }}' + msbuildArguments: '-p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=$(OrtPackageId) -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }}' workingDirectory: '$(Build.SourcesDirectory)\csharp' - ${{ if eq(parameters.DoEsrp, true) }}: @@ -269,15 +254,6 @@ stages: DisplayName: 'ESRP - Sign C# dlls' DoEsrp: ${{ parameters.DoEsrp }} - - task: MSBuild@1 - displayName: Update projects.assets.json with combined list of all target frameworks - inputs: - solution: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj' - platform: 'Any CPU' - configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=All -p:OrtPackageId=$(OrtPackageId)' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - task: MSBuild@1 displayName: 'Build Nuget Packages' inputs: diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index 187c7656602f5..8c926619c797d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -161,11 +161,13 @@ jobs: workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (wasm,webgl,xnnpack backend)' condition: eq('${{ parameters.RunWebGpuTests }}', 'false') + retryCountOnTaskFailure: 3 - script: | npm test -- -e=edge -b=webgl,wasm,xnnpack,webgpu $(webgpuCommandlineExtraFlags) workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (ALL backends)' condition: eq('${{ parameters.RunWebGpuTests }}', 'true') + retryCountOnTaskFailure: 3 - script: | npm test -- suite1 -e=edge 
-b=webgpu --io-binding=gpu-tensor $(webgpuCommandlineExtraFlags) workingDirectory: '$(Build.SourcesDirectory)\js\web' @@ -180,10 +182,12 @@ jobs: workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests (Suite1, webgpu, IO-binding=gpu-location)' condition: eq('${{ parameters.RunWebGpuTests }}', 'true') + retryCountOnTaskFailure: 3 - script: | npm test -- --webgl-texture-pack-mode -b=webgl -e=edge workingDirectory: '$(Build.SourcesDirectory)\js\web' displayName: 'Run ort-web tests - WebGL: packed mode' + retryCountOnTaskFailure: 3 - script: | npm test -- --wasm-enable-proxy -b=wasm -e=edge workingDirectory: '$(Build.SourcesDirectory)\js\web' diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index f7b68551b9c50..df74e7e5599a8 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -557,7 +557,7 @@ def generate_files(line_list, args): files_list.append( "' @@ -793,8 +793,10 @@ def generate_files(line_list, args): "" ) - # Some tools to be packaged in nightly build only, should not be released + # Some tools to be packaged in nightly debug build only, should not be released # These are copied to the runtimes folder for convenience of loading with the dlls + # NOTE: nuget gives a spurious error on linux if these aren't in a separate directory to the library so + # we add them to a tools folder for that reason. if ( args.is_release_build.lower() != "true" and args.target_architecture == "x64" @@ -804,7 +806,10 @@ def generate_files(line_list, args): "" ) @@ -817,7 +822,10 @@ def generate_files(line_list, args): "" ) @@ -871,7 +879,6 @@ def generate_files(line_list, args): os.system(copy_command + " " + source_props + " " + target_props) files_list.append("') if not is_snpe_package and not is_qnn_package: - files_list.append("') files_list.append("') # Process targets file @@ -890,7 +897,6 @@ def generate_files(line_list, args): os.system(copy_command + " " + source_targets + " " + target_targets) files_list.append("') if not is_snpe_package and not is_qnn_package: - files_list.append("') files_list.append("') # Process xamarin targets files
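Editor's note: the new generator script added at the top of this change produces a minimal Reshape model (inputs "data" of shape ["N", 2] and "shape" of shape [2], output "reshaped" of shape [4, 1]) for the TensorRT EP tests. The sketch below is not part of the patch; it is only a hedged, local sanity check of the generated model, assuming onnxruntime and numpy are installed and falling back to the CPU EP when the TensorRT EP is not built in.

    # sanity_check_trt_reshape.py (illustration only)
    import numpy as np
    import onnxruntime as ort

    # Prefer the TensorRT EP if this build has it; otherwise use the CPU EP.
    requested = ["TensorrtExecutionProvider", "CPUExecutionProvider"]
    providers = [p for p in requested if p in ort.get_available_providers()]

    sess = ort.InferenceSession("trt_reshape.onnx", providers=providers)

    data = np.arange(4, dtype=np.float32).reshape(2, 2)  # "data": ["N", 2] with N=2
    shape = np.array([4, 1], dtype=np.int64)             # "shape": [2]
    (reshaped,) = sess.run(None, {"data": data, "shape": shape})
    print(reshaped.shape)  # expected: (4, 1)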
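Editor's note: the reworked build_nuget_package in tools/ci_build/build.py now builds one command list, forwards msbuild_extra_options as "/p:" properties, and prefixes the command with "dotnet" unless the Xamarin-inclusive OnnxRuntime.CSharp.sln is used (that solution must be driven by msbuild directly). The following is a standalone sketch of that dispatch logic, not the actual build.py code; the helper name is hypothetical.

    # make_nuget_build_command.py (illustration only)
    from typing import List

    def make_nuget_build_command(sln: str, config: str, msbuild_extra_options: List[str]) -> List[str]:
        # extra options arrive as "Name=Value" pairs and are forwarded as MSBuild properties
        extra_options = ["/p:" + option for option in msbuild_extra_options]

        # the Xamarin-inclusive solution has to go through msbuild directly;
        # everything else can use `dotnet msbuild`
        use_dotnet = sln != "OnnxRuntime.CSharp.sln"

        cmd = ["dotnet"] if use_dotnet else []
        cmd += ["msbuild", sln, "/p:Configuration=" + config, *extra_options]
        return cmd

    # Example:
    # make_nuget_build_command("OnnxRuntime.DesktopOnly.CSharp.sln", "RelWithDebInfo",
    #                          ["IncludeMobileTargets=false"])
    # -> ["dotnet", "msbuild", "OnnxRuntime.DesktopOnly.CSharp.sln",
    #     "/p:Configuration=RelWithDebInfo", "/p:IncludeMobileTargets=false"]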