diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index daacd221caa93..5555fa692eae8 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -551,7 +551,7 @@ if(NOT WIN32 AND NOT CMAKE_SYSTEM_NAME STREQUAL "Android") endif() find_package(Patch) -if (WIN32 AND NOT Patch_FOUND) +if (CMAKE_HOST_WIN32 AND NOT Patch_FOUND) # work around CI machines missing patch from the git install by falling back to the binary in this repo. # replicate what happens in https://github.com/Kitware/CMake/blob/master/Modules/FindPatch.cmake but without # the hardcoded suffixes in the path to the patch binary. @@ -1040,7 +1040,7 @@ function(onnxruntime_set_compile_flags target_name) # Enable warning target_compile_options(${target_name} PRIVATE "$<$:SHELL:--compiler-options -Wall>" "$<$>:-Wall>") target_compile_options(${target_name} PRIVATE "$<$>:-Wextra>") - if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang") + if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "IBMClang") #external/protobuf/src/google/protobuf/arena.h:445:18: error: unused parameter 'p' target_compile_options(${target_name} PRIVATE "-Wno-unused-parameter") endif() @@ -1140,6 +1140,13 @@ endfunction() function(onnxruntime_add_shared_library target_name) add_library(${target_name} SHARED ${ARGN}) onnxruntime_configure_target(${target_name}) + if(WIN32) + target_compile_definitions(${target_name} PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) + target_compile_definitions(${target_name} PRIVATE VER_MINOR=${VERSION_MINOR_PART}) + target_compile_definitions(${target_name} PRIVATE VER_BUILD=${VERSION_BUILD_PART}) + target_compile_definitions(${target_name} PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) + target_compile_definitions(${target_name} PRIVATE VER_STRING=\"${VERSION_STRING}\") + endif() endfunction() function(onnxruntime_add_static_library target_name) @@ -1154,6 +1161,13 @@ function(onnxruntime_add_shared_library_module target_name) else() #On Windows, this target shouldn't generate an import lib, but I don't know how to disable it. 
add_library(${target_name} MODULE ${ARGN}) + if(WIN32) + target_compile_definitions(${target_name} PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) + target_compile_definitions(${target_name} PRIVATE VER_MINOR=${VERSION_MINOR_PART}) + target_compile_definitions(${target_name} PRIVATE VER_BUILD=${VERSION_BUILD_PART}) + target_compile_definitions(${target_name} PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) + target_compile_definitions(${target_name} PRIVATE VER_STRING=\"${VERSION_STRING}\") + endif() endif() onnxruntime_configure_target(${target_name}) @@ -1488,9 +1502,6 @@ if (onnxruntime_USE_CUDA) endif() if (onnxruntime_USE_MIGRAPHX) - if (WIN32) - message(FATAL_ERROR "MIGraphX does not support build in Windows!") - endif() set(AMD_MIGRAPHX_HOME ${onnxruntime_MIGRAPHX_HOME}) endif() @@ -1560,7 +1571,7 @@ if (UNIX OR onnxruntime_USE_NCCL) if (onnxruntime_USE_NCCL) if (onnxruntime_USE_CUDA) set(NCCL_LIBNAME "nccl") - elseif (onnxruntime_USE_ROCM) + elseif (onnxruntime_USE_ROCM OR onnxruntime_USE_MIGRAPHX) set(NCCL_LIBNAME "rccl") endif() find_path(NCCL_INCLUDE_DIR @@ -1639,6 +1650,14 @@ set(VERSION_MINOR_PART 0 CACHE STRING "Second part of numeric file/product ver set(VERSION_BUILD_PART 0 CACHE STRING "Third part of numeric file/product version.") set(VERSION_PRIVATE_PART 0 CACHE STRING "Fourth part of numeric file/product version.") set(VERSION_STRING "Internal Build" CACHE STRING "String representation of file/product version.") +if(VERSION_MAJOR_PART STREQUAL "0" AND VERSION_MINOR_PART STREQUAL "0" AND VERSION_BUILD_PART STREQUAL "0" AND VERSION_PRIVATE_PART STREQUAL "0") + string(REPLACE "." ";" ORT_VERSION_STRING_LIST ${ORT_VERSION}) + list(GET ORT_VERSION_STRING_LIST 0 VERSION_MAJOR_PART) + list(GET ORT_VERSION_STRING_LIST 1 VERSION_MINOR_PART) + list(GET ORT_VERSION_STRING_LIST 2 VERSION_BUILD_PART) + set(VERSION_STRING ORT_VERSION) +endif() + if (WIN32) list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SYS_PATH_LIB}) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index a29f89ea8289a..5eb9cf2fdce0f 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -47,6 +47,9 @@ if (onnxruntime_BUILD_UNIT_TESTS) if (CMAKE_SYSTEM_NAME STREQUAL "Emscripten") set(gtest_disable_pthreads ON) endif() + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set(gtest_disable_pthreads ON CACHE BOOL "gtest_disable_pthreads" FORCE) + endif() set(INSTALL_GTEST OFF CACHE BOOL "" FORCE) if (IOS OR ANDROID) # on mobile platforms the absl flags class dumps the flag names (assumably for binary size), which breaks passing diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index ec98047750a91..21ae0947f3788 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -57,6 +57,7 @@ foreach(f ${ONNXRUNTIME_PROVIDER_NAMES}) list(APPEND SYMBOL_FILES "${ONNXRUNTIME_ROOT}/core/providers/${f}/symbols.txt") endforeach() +if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") add_custom_command(OUTPUT ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c COMMAND ${Python_EXECUTABLE} "${REPO_ROOT}/tools/ci_build/gen_def.py" --version_file "${ONNXRUNTIME_ROOT}/../VERSION_NUMBER" --src_root "${ONNXRUNTIME_ROOT}" @@ -66,6 +67,7 @@ add_custom_command(OUTPUT ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_s WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) add_custom_target(onnxruntime_generate_def ALL DEPENDS ${SYMBOL_FILE} ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c) +endif() if(WIN32) 
onnxruntime_add_shared_library(onnxruntime ${SYMBOL_FILE} @@ -95,30 +97,33 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK) FRAMEWORK TRUE FRAMEWORK_VERSION A MACOSX_FRAMEWORK_INFO_PLIST ${INFO_PLIST_PATH} - SOVERSION ${ORT_VERSION} # Note: The PUBLIC_HEADER and VERSION properties for the 'onnxruntime' target will be set later in this file. ) else() - onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c) + if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + onnxruntime_add_shared_library(onnxruntime ${ONNXRUNTIME_ROOT}/core/session/onnxruntime_c_api.cc) + else() + onnxruntime_add_shared_library(onnxruntime ${CMAKE_CURRENT_BINARY_DIR}/generated_source.c ) + endif() if (onnxruntime_USE_CUDA) set_property(TARGET onnxruntime APPEND_STRING PROPERTY LINK_FLAGS " -Xlinker -rpath=\\$ORIGIN") endif() endif() -add_dependencies(onnxruntime onnxruntime_generate_def ${onnxruntime_EXTERNAL_DEPENDENCIES}) +if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + add_dependencies(onnxruntime ${onnxruntime_EXTERNAL_DEPENDENCIES}) +else() + add_dependencies(onnxruntime onnxruntime_generate_def ${onnxruntime_EXTERNAL_DEPENDENCIES}) +endif() target_include_directories(onnxruntime PRIVATE ${ONNXRUNTIME_ROOT} PUBLIC "$") -target_compile_definitions(onnxruntime PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) -target_compile_definitions(onnxruntime PRIVATE VER_MINOR=${VERSION_MINOR_PART}) -target_compile_definitions(onnxruntime PRIVATE VER_BUILD=${VERSION_BUILD_PART}) -target_compile_definitions(onnxruntime PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) -target_compile_definitions(onnxruntime PRIVATE VER_STRING=\"${VERSION_STRING}\") + target_compile_definitions(onnxruntime PRIVATE FILE_NAME=\"onnxruntime.dll\") if(UNIX) if (APPLE) set(ONNXRUNTIME_SO_LINK_FLAG " -Xlinker -dead_strip") - else() + elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(ONNXRUNTIME_SO_LINK_FLAG " -Xlinker --version-script=${SYMBOL_FILE} -Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") endif() else() @@ -130,7 +135,6 @@ if (NOT WIN32) set(ONNXRUNTIME_SO_LINK_FLAG " -Wl,-exported_symbols_list,${SYMBOL_FILE}") if (${CMAKE_SYSTEM_NAME} STREQUAL "iOS") set_target_properties(onnxruntime PROPERTIES - SOVERSION ${ORT_VERSION} MACOSX_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH FALSE BUILD_WITH_INSTALL_NAME_DIR TRUE @@ -138,7 +142,7 @@ if (NOT WIN32) else() set_target_properties(onnxruntime PROPERTIES INSTALL_RPATH "@loader_path") endif() - elseif (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") + elseif (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'") endif() endif() @@ -206,6 +210,10 @@ set(onnxruntime_INTERNAL_LIBRARIES onnxruntime_flatbuffers ) +if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_INTERNAL_LIBRARIES iconv) +endif() + if (onnxruntime_USE_EXTENSIONS) list(APPEND onnxruntime_INTERNAL_LIBRARIES onnxruntime_extensions @@ -222,13 +230,30 @@ target_link_libraries(onnxruntime PRIVATE ) set_property(TARGET onnxruntime APPEND_STRING PROPERTY LINK_FLAGS ${ONNXRUNTIME_SO_LINK_FLAG} ${onnxruntime_DELAYLOAD_FLAGS}) -set_target_properties(onnxruntime PROPERTIES - PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}" - LINK_DEPENDS ${SYMBOL_FILE} - VERSION ${ORT_VERSION} - FOLDER "ONNXRuntime" -) - +#See: https://cmake.org/cmake/help/latest/prop_tgt/SOVERSION.html +if(NOT APPLE AND NOT WIN32) + if(${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set_target_properties(onnxruntime PROPERTIES + PUBLIC_HEADER 
"${ONNXRUNTIME_PUBLIC_HEADERS}" + VERSION ${ORT_VERSION} + SOVERSION 1 + FOLDER "ONNXRuntime") + else() + set_target_properties(onnxruntime PROPERTIES + PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}" + LINK_DEPENDS ${SYMBOL_FILE} + VERSION ${ORT_VERSION} + SOVERSION 1 + FOLDER "ONNXRuntime") + endif() +else() + # Omit the SOVERSION setting in Windows/macOS/iOS/.. build + set_target_properties(onnxruntime PROPERTIES + PUBLIC_HEADER "${ONNXRUNTIME_PUBLIC_HEADERS}" + LINK_DEPENDS ${SYMBOL_FILE} + VERSION ${ORT_VERSION} + FOLDER "ONNXRuntime") +endif() install(TARGETS onnxruntime EXPORT ${PROJECT_NAME}Targets PUBLIC_HEADER DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime diff --git a/cmake/onnxruntime_framework.cmake b/cmake/onnxruntime_framework.cmake index c9bf2ac5c3dc6..43d16abd8fbae 100644 --- a/cmake/onnxruntime_framework.cmake +++ b/cmake/onnxruntime_framework.cmake @@ -108,7 +108,7 @@ add_dependencies(onnxruntime_framework ${onnxruntime_EXTERNAL_DEPENDENCIES}) # For the shared onnxruntime library, this is set in onnxruntime.cmake through CMAKE_SHARED_LINKER_FLAGS # But our test files don't use the shared library so this must be set for them. # For Win32 it generates an absolute path for shared providers based on the location of the executable/onnxruntime.dll -if (UNIX AND NOT APPLE AND NOT onnxruntime_MINIMAL_BUILD AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") +if (UNIX AND NOT APPLE AND NOT onnxruntime_MINIMAL_BUILD AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-rpath='$ORIGIN'") endif() diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index df6553e383620..66f4aea606ef5 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -427,12 +427,24 @@ else() ) if(COMPILES_P10) check_cxx_source_compiles(" + #ifdef _AIX + #define POWER_10 0x40000 + #define POWER_10_ANDUP (POWER_10) + #include + #define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP) + int main() { + bool HasP10 = (__power_10_andup() && __power_mma_version() == MMA_V31); + return 0; + } + #else #include int main() { unsigned long hwcap2 = getauxval(AT_HWCAP2); bool HasP10 = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1)); return 0; - }" + } + } + #endif" HAS_P10_RUNTIME ) if (HAS_P10_RUNTIME) diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake index b211c02f712bd..d2afe19f36691 100644 --- a/cmake/onnxruntime_providers_cpu.cmake +++ b/cmake/onnxruntime_providers_cpu.cmake @@ -236,11 +236,6 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD set_target_properties(onnxruntime_providers_shared PROPERTIES FOLDER "ONNXRuntime") set_target_properties(onnxruntime_providers_shared PROPERTIES LINKER_LANGUAGE CXX) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_MINOR=${VERSION_MINOR_PART}) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_BUILD=${VERSION_BUILD_PART}) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) - target_compile_definitions(onnxruntime_providers_shared PRIVATE VER_STRING=\"${VERSION_STRING}\") target_compile_definitions(onnxruntime_providers_shared PRIVATE FILE_NAME=\"onnxruntime_providers_shared.dll\") @@ -252,7 +247,9 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT 
onnxruntime_EXTENDED_MINIMAL_BUILD if(APPLE) set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/shared/exported_symbols.lst") elseif(UNIX) - set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections") + if(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/shared/version_script.lds -Xlinker --gc-sections") + endif() elseif(WIN32) set_property(TARGET onnxruntime_providers_shared APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/shared/symbols.def") set(ONNXRUNTIME_PROVIDERS_SHARED onnxruntime_providers_shared) diff --git a/cmake/onnxruntime_providers_migraphx.cmake b/cmake/onnxruntime_providers_migraphx.cmake index 01c4f8b2c8719..d7d83b0ce8d64 100644 --- a/cmake/onnxruntime_providers_migraphx.cmake +++ b/cmake/onnxruntime_providers_migraphx.cmake @@ -19,23 +19,25 @@ endif() # Add search paths for default rocm installation - list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hcc /opt/rocm/hip /opt/rocm) + list(APPEND CMAKE_PREFIX_PATH /opt/rocm/hcc /opt/rocm/hip /opt/rocm $ENV{HIP_PATH}) - find_package(hip) - find_package(migraphx PATHS ${AMD_MIGRAPHX_HOME}) + # Suppress the warning about the small capitals of the package name - Enable when support to CMake 3.27.0 is used + # cmake_policy(SET CMP0144 NEW) - find_package(miopen) - find_package(rocblas) + if(WIN32 AND NOT HIP_PLATFORM) + set(HIP_PLATFORM "amd") + endif() + + find_package(hip REQUIRED) + find_package(migraphx REQUIRED PATHS ${AMD_MIGRAPHX_HOME}) - set(migraphx_libs migraphx::c hip::host MIOpen roc::rocblas) + set(migraphx_libs migraphx::c hip::host) file(GLOB_RECURSE onnxruntime_providers_migraphx_cc_srcs CONFIGURE_DEPENDS "${ONNXRUNTIME_ROOT}/core/providers/migraphx/*.h" "${ONNXRUNTIME_ROOT}/core/providers/migraphx/*.cc" "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.h" "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" - "${ONNXRUNTIME_ROOT}/core/providers/rocm/rocm_stream_handle.h" - "${ONNXRUNTIME_ROOT}/core/providers/rocm/rocm_stream_handle.cc" ) source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_migraphx_cc_srcs}) onnxruntime_add_shared_library_module(onnxruntime_providers_migraphx ${onnxruntime_providers_migraphx_cc_srcs}) @@ -46,18 +48,16 @@ set_target_properties(onnxruntime_providers_migraphx PROPERTIES LINKER_LANGUAGE CXX) set_target_properties(onnxruntime_providers_migraphx PROPERTIES FOLDER "ONNXRuntime") target_compile_definitions(onnxruntime_providers_migraphx PRIVATE ONNXIFI_BUILD_LIBRARY=1) - target_compile_options(onnxruntime_providers_migraphx PRIVATE -Wno-error=sign-compare) - set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") - set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/migraphx/version_script.lds -Xlinker --gc-sections") - target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp) - - include(CheckLibraryExists) - check_library_exists(migraphx::c "migraphx_program_run_async" "/opt/rocm/migraphx/lib" HAS_STREAM_SYNC) - if(HAS_STREAM_SYNC) - target_compile_definitions(onnxruntime_providers_migraphx PRIVATE 
-DMIGRAPHX_STREAM_SYNC) - message(STATUS "MIGRAPHX GPU STREAM SYNC is ENABLED") + if(MSVC) + set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS /DEF:${ONNXRUNTIME_ROOT}/core/providers/migraphx/symbols.def) + target_link_libraries(onnxruntime_providers_migraphx PRIVATE ws2_32) else() - message(STATUS "MIGRAPHX GPU STREAM SYNC is DISABLED") + target_compile_options(onnxruntime_providers_migraphx PRIVATE -Wno-error=sign-compare) + set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") + endif() + if(UNIX) + set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/migraphx/version_script.lds -Xlinker --gc-sections") + target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp stdc++fs) endif() if (onnxruntime_ENABLE_TRAINING_OPS) @@ -68,8 +68,16 @@ endif() endif() - install(TARGETS onnxruntime_providers_migraphx - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - ) + if(CMAKE_SYSTEM_NAME STREQUAL "Windows") + install(TARGETS onnxruntime_providers_migraphx + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_BINDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + else() + install(TARGETS onnxruntime_providers_migraphx + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + endif() diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index 5876b2b5c448b..d738e29101cfe 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -45,11 +45,6 @@ target_include_directories(onnxruntime_providers_openvino SYSTEM PUBLIC ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${OpenVINO_INCLUDE_DIR} ${OPENVINO_INCLUDE_DIR_LIST} ${PYTHON_INCLUDE_DIRS} $ENV{OPENCL_INCS} $ENV{OPENCL_INCS}/../../cl_headers/) target_link_libraries(onnxruntime_providers_openvino ${ONNXRUNTIME_PROVIDERS_SHARED} Boost::mp11 ${OPENVINO_LIB_LIST} ${ABSEIL_LIBS}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_MINOR=${VERSION_MINOR_PART}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_BUILD=${VERSION_BUILD_PART}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) - target_compile_definitions(onnxruntime_providers_openvino PRIVATE VER_STRING=\"${VERSION_STRING}\") target_compile_definitions(onnxruntime_providers_openvino PRIVATE FILE_NAME=\"onnxruntime_providers_openvino.dll\") if(MSVC) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 711a9f77f9094..0159c35d1941b 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1225,6 +1225,9 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) if (CMAKE_SYSTEM_NAME STREQUAL "Android") list(APPEND onnxruntime_perf_test_libs ${android_shared_libs}) endif() + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_perf_test_libs onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers 
iconv re2 gtest absl_failure_signal_handler absl_examine_stack absl_flags_parse absl_flags_usage absl_flags_usage_internal) + endif() target_link_libraries(onnxruntime_perf_test PRIVATE ${onnxruntime_perf_test_libs} Threads::Threads) if(WIN32) target_link_libraries(onnxruntime_perf_test PRIVATE debug dbghelp advapi32) @@ -1275,6 +1278,10 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) list(APPEND onnxruntime_shared_lib_test_LIBS ${android_shared_libs}) endif() + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_shared_lib_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2) + endif() + AddTest(DYN TARGET onnxruntime_shared_lib_test SOURCES ${onnxruntime_shared_lib_test_SRC} ${onnxruntime_unittest_main_src} @@ -1510,7 +1517,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") if(UNIX) if (APPLE) set(ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG "-Xlinker -dead_strip") - else() + elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(ONNXRUNTIME_CUSTOM_OP_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_library/custom_op_library.lds -Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") endif() else() @@ -1574,6 +1581,9 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") if (onnxruntime_USE_TENSORRT) list(APPEND onnxruntime_customopregistration_test_LIBS ${TENSORRT_LIBRARY_INFER}) endif() + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_customopregistration_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 libprotobuf-lite onnx_proto nsync_cpp) + endif() AddTest(DYN TARGET onnxruntime_customopregistration_test SOURCES ${onnxruntime_customopregistration_test_SRC} ${onnxruntime_unittest_main_src} @@ -1608,7 +1618,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI if(UNIX) if (APPLE) set(ONNXRUNTIME_CUSTOM_OP_INVALID_LIB_LINK_FLAG "-Xlinker -dead_strip") - else() + elseif (NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") string(CONCAT ONNXRUNTIME_CUSTOM_OP_INVALID_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_invalid_library/custom_op_library.lds " "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") @@ -1639,7 +1649,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI if(UNIX) if (APPLE) set(ONNXRUNTIME_CUSTOM_OP_GET_CONST_INPUT_TEST_LIB_LINK_FLAG "-Xlinker -dead_strip") - else() + elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") string(CONCAT ONNXRUNTIME_CUSTOM_OP_GET_CONST_INPUT_TEST_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_get_const_input_test_library/custom_op_lib.lds " "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") @@ -1671,7 +1681,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND (NOT onnxruntime_MINIMAL_BUI if(UNIX) if (APPLE) set(ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG "-Xlinker -dead_strip") - else() + elseif(NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") string(CONCAT ONNXRUNTIME_CUSTOM_OP_lOCAL_FUNCTION_TEST_LIB_LINK_FLAG "-Xlinker --version-script=${TEST_SRC_DIR}/testdata/custom_op_local_function/custom_op_local_function.lds " "-Xlinker --no-undefined -Xlinker --gc-sections -z noexecstack") @@ -1690,6 +1700,9 @@ if (onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" 
${ONNXRUNTIME_LOGGING_APIS_TEST_SRC_DIR}/test_logging_apis.cc) set(onnxruntime_logging_apis_test_LIBS onnxruntime_common onnxruntime_test_utils) + if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + list(APPEND onnxruntime_logging_apis_test_LIBS onnxruntime_session onnxruntime_util onnxruntime_framework onnxruntime_common onnxruntime_graph onnxruntime_providers onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 libprotobuf-lite onnx_proto nsync_cpp) + endif() if(NOT WIN32) list(APPEND onnxruntime_logging_apis_test_LIBS nsync::nsync_cpp ${CMAKE_DL_LIBS}) @@ -1753,7 +1766,9 @@ if (NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD if(APPLE) set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/exported_symbols.lst") elseif(UNIX) - set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\\$ORIGIN") + if (NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\\$ORIGIN") + endif() elseif(WIN32) set_property(TARGET test_execution_provider APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${REPO_ROOT}/onnxruntime/test/testdata/custom_execution_provider_library/symbols.def") else() diff --git a/cmake/patches/flatbuffers/flatbuffers.patch b/cmake/patches/flatbuffers/flatbuffers.patch index fbe8db37ecb0e..9fb58e301bba8 100644 --- a/cmake/patches/flatbuffers/flatbuffers.patch +++ b/cmake/patches/flatbuffers/flatbuffers.patch @@ -10,3 +10,21 @@ index 3987eac9..5e5462f1 100644 + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATBUFFERS_CXX_FLAGS} -Wno-error=stringop-overflow") endif() message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") +diff --git a/include/flatbuffers/flatbuffers.h b/include/flatbuffers/flatbuffers.h +index bc828a31..3d3effe8 100644 +--- a/include/flatbuffers/flatbuffers.h ++++ b/include/flatbuffers/flatbuffers.h +@@ -213,7 +213,12 @@ inline const char * const *ElementaryTypeNames() { + // We're explicitly defining the signedness since the signedness of integer + // bitfields is otherwise implementation-defined and causes warnings on older + // GCC compilers. 
+-struct TypeCode { ++ ++struct ++#if defined(_AIX) && defined(__clang__) ++__attribute__((packed)) ++#endif ++TypeCode { + // ElementaryType + unsigned short base_type : 4; + // Either vector (in table) or array (in struct) diff --git a/cmake/winml.cmake b/cmake/winml.cmake index d74250b962628..ff6b71217ad87 100644 --- a/cmake/winml.cmake +++ b/cmake/winml.cmake @@ -718,11 +718,6 @@ target_compile_definitions(winml_dll PRIVATE ONNX_ML) target_compile_definitions(winml_dll PRIVATE LOTUS_LOG_THRESHOLD=2) target_compile_definitions(winml_dll PRIVATE LOTUS_ENABLE_STDERR_LOGGING) target_compile_definitions(winml_dll PRIVATE PLATFORM_WINDOWS) -target_compile_definitions(winml_dll PRIVATE VER_MAJOR=${VERSION_MAJOR_PART}) -target_compile_definitions(winml_dll PRIVATE VER_MINOR=${VERSION_MINOR_PART}) -target_compile_definitions(winml_dll PRIVATE VER_BUILD=${VERSION_BUILD_PART}) -target_compile_definitions(winml_dll PRIVATE VER_PRIVATE=${VERSION_PRIVATE_PART}) -target_compile_definitions(winml_dll PRIVATE VER_STRING=\"${VERSION_STRING}\") target_compile_definitions(winml_dll PRIVATE BINARY_NAME=\"${BINARY_NAME}\") if (onnxruntime_WINML_NAMESPACE_OVERRIDE STREQUAL "Windows") diff --git a/include/onnxruntime/core/graph/basic_types.h b/include/onnxruntime/core/graph/basic_types.h index 36984d0405bbd..cdd5e4c1e571b 100644 --- a/include/onnxruntime/core/graph/basic_types.h +++ b/include/onnxruntime/core/graph/basic_types.h @@ -19,6 +19,8 @@ class TensorProto; class SparseTensorProto; class TypeProto; class AttributeProto; +class FunctionProto; +class OperatorSetIdProto; // define types that would come from the ONNX library if we were building against it. #if defined(ORT_MINIMAL_BUILD) using OperatorSetVersion = int; diff --git a/java/build-android.gradle b/java/build-android.gradle index afbad9f03d08d..fd22fa27e8db9 100644 --- a/java/build-android.gradle +++ b/java/build-android.gradle @@ -105,7 +105,7 @@ task sourcesJar(type: Jar) { task javadoc(type: Javadoc) { source = android.sourceSets.main.java.srcDirs - classpath += project.files(android.getBootClasspath().join(File.pathSeparator)) + classpath += project.files(android.getBootClasspath()) } task javadocJar(type: Jar, dependsOn: javadoc) { diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts index 29c7941e6bd30..9b37247167bab 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts @@ -328,13 +328,6 @@ fn main(@builtin(local_invocation_id) localId : vec3, var kStart = ${splitK ? `i32(globalId.z) * ${splitedDimInner}` : '0'}; var acc : array, rowPerThread>; - - // Without this initialization strange values show up in acc. - for (var innerRow = 0; innerRow < rowPerThread; innerRow = innerRow + 1) { - for (var innerCol = 0; innerCol < colPerThread; innerCol = innerCol + 1) { - acc[innerRow][innerCol] = 0.0; - } - } ${matmulSnippet} } `; diff --git a/onnxruntime/contrib_ops/cpu/murmur_hash3.cc b/onnxruntime/contrib_ops/cpu/murmur_hash3.cc index ec504d215920f..000c590f32616 100644 --- a/onnxruntime/contrib_ops/cpu/murmur_hash3.cc +++ b/onnxruntime/contrib_ops/cpu/murmur_hash3.cc @@ -8,6 +8,8 @@ /* Modifications Copyright (c) Microsoft. 
*/ #include "contrib_ops/cpu/murmur_hash3.h" +#include +#include // Platform-specific functions and macros @@ -60,11 +62,31 @@ inline uint64_t rotl64(uint64_t x, int8_t r) { // handle aligned reads, do the conversion here FORCE_INLINE uint32_t getblock(const uint32_t* p, int i) { - return p[i]; + if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) { + return p[i]; + } else { + const uint8_t* c = (const uint8_t*)&p[i]; + return (uint32_t)c[0] | + (uint32_t)c[1] << 8 | + (uint32_t)c[2] << 16 | + (uint32_t)c[3] << 24; + } } FORCE_INLINE uint64_t getblock(const uint64_t* p, int i) { - return p[i]; + if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) { + return p[i]; + } else { + const uint8_t* c = (const uint8_t*)&p[i]; + return (uint64_t)c[0] | + (uint64_t)c[1] << 8 | + (uint64_t)c[2] << 16 | + (uint64_t)c[3] << 24 | + (uint64_t)c[4] << 32 | + (uint64_t)c[5] << 40 | + (uint64_t)c[6] << 48 | + (uint64_t)c[7] << 56; + } } //----------------------------------------------------------------------------- @@ -204,13 +226,35 @@ Status MurmurHash3::Compute(OpKernelContext* ctx) const { int input_num_bytes = static_cast(input_element_bytes); ORT_ENFORCE(input_num_bytes % 4 == 0); const auto input_end = input + input_count * input_num_bytes; - while (input != input_end) { - MurmurHash3_x86_32(input, - input_num_bytes, - seed_, - output); - input += input_num_bytes; - ++output; + + if constexpr (onnxruntime::endian::native == onnxruntime::endian::little) { + while (input != input_end) { + MurmurHash3_x86_32(input, + input_num_bytes, + seed_, + output); + input += input_num_bytes; + ++output; + } + } else { + // Big endian platform require byte swapping. + auto raw_data = std::make_unique(input_num_bytes); + char* raw_data_ptr = raw_data.get(); + while (input != input_end) { + memcpy(raw_data_ptr, input, input_num_bytes); + char* start_byte = raw_data_ptr; + char* end_byte = start_byte + input_num_bytes - 1; + for (size_t count = 0; count < static_cast(input_num_bytes / 2); ++count) { + std::swap(*start_byte++, *end_byte--); + } + + MurmurHash3_x86_32(raw_data_ptr, + input_num_bytes, + seed_, + output); + input += input_num_bytes; + ++output; + } } } return Status::OK(); diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc index 7e343d85f4048..b28f3758f89b5 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits_impl.cc @@ -40,6 +40,13 @@ void Dequantize4BitsKernelReOrder( } T* output_i = output + out_y * out_cols + out_x; uint32_t quant_value = *(reinterpret_cast(quant_data + element_offset / 2)); + if constexpr (onnxruntime::endian::native == onnxruntime::endian::big) { + const uint8_t* c = (const uint8_t*)(&quant_value); + quant_value = (uint32_t)c[0] | + (uint32_t)c[1] << 8 | + (uint32_t)c[2] << 16 | + (uint32_t)c[3] << 24; + } const int remain_x = std::min(8, out_cols - out_x); const int32_t* reorder_idx_with_off = reorder_idx + kb_idx * block_size + ((threadIdx_x * 8) & (block_size - 1)); for (int i = 0; i < remain_x; i++) { diff --git a/onnxruntime/core/framework/tensorprotoutils.cc b/onnxruntime/core/framework/tensorprotoutils.cc index e8086877a9159..4ecd61962d797 100644 --- a/onnxruntime/core/framework/tensorprotoutils.cc +++ b/onnxruntime/core/framework/tensorprotoutils.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include #if defined(__wasm__) #include @@ -260,7 +261,89 @@ 
Status TensorProtoToOrtValueImpl(const Env& env, const std::filesystem::path& mo namespace utils { +void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::string&& param) { + tensor_proto.set_raw_data(std::move(param)); +} + +void ConvertRawDataInTensorProto(TensorProto* tensor) { + size_t element_size = 1; + char* bytes = NULL; + size_t num_elements = 0; + switch (tensor->data_type()) { + case TensorProto_DataType_FLOAT: + bytes = reinterpret_cast(tensor->mutable_float_data()->mutable_data()); + num_elements = tensor->float_data_size(); + element_size = sizeof(float); + break; + + case TensorProto_DataType_INT32: + bytes = reinterpret_cast(tensor->mutable_int32_data()->mutable_data()); + num_elements = tensor->int32_data_size(); + element_size = sizeof(int32_t); + break; + + case TensorProto_DataType_UINT32: + bytes = reinterpret_cast(tensor->mutable_int32_data()->mutable_data()); + num_elements = tensor->int32_data_size(); + element_size = sizeof(uint32_t); + break; + + case TensorProto_DataType_UINT8: + case TensorProto_DataType_INT8: + bytes = reinterpret_cast(tensor->mutable_int32_data()->mutable_data()); + num_elements = tensor->int32_data_size(); + element_size = sizeof(uint8_t); + break; + + case TensorProto_DataType_UINT16: + case TensorProto_DataType_INT16: + case TensorProto_DataType_FLOAT16: + case TensorProto_DataType_BFLOAT16: + bytes = reinterpret_cast(tensor->mutable_int32_data()->mutable_data()); + num_elements = tensor->int32_data_size(); + element_size = sizeof(uint16_t); + break; + + case TensorProto_DataType_UINT64: + bytes = reinterpret_cast(tensor->mutable_uint64_data()->mutable_data()); + num_elements = tensor->uint64_data_size(); + element_size = sizeof(uint64_t); + break; + + case TensorProto_DataType_DOUBLE: + bytes = reinterpret_cast(tensor->mutable_double_data()->mutable_data()); + num_elements = tensor->double_data_size(); + element_size = sizeof(double); + break; + + case TensorProto_DataType_INT64: + bytes = reinterpret_cast(tensor->mutable_int64_data()->mutable_data()); + num_elements = tensor->int64_data_size(); + element_size = sizeof(int64_t); + break; + + case TensorProto_DataType_COMPLEX64: + bytes = reinterpret_cast(tensor->mutable_float_data()->mutable_data()); + num_elements = tensor->float_data_size(); + element_size = sizeof(float); + break; + } + if (tensor->has_raw_data()) { + num_elements = (tensor->raw_data().size()) / element_size; + bytes = const_cast(tensor->mutable_raw_data()->c_str()); + } + for (size_t i = 0; i < num_elements; ++i) { + char* start_byte = bytes + i * element_size; + char* end_byte = start_byte + element_size - 1; + for (size_t count = 0; count < element_size / 2; ++count) { + std::swap(*start_byte++, *end_byte--); + } + } + return; +} + #if !defined(ORT_MINIMAL_BUILD) + static Status UnpackTensorWithExternalDataImpl(const ONNX_NAMESPACE::TensorProto& tensor, const std::filesystem::path& tensor_proto_dir, size_t expected_num_elements, size_t element_size, @@ -1159,11 +1242,6 @@ ONNXTensorElementDataType GetTensorElementType(const ONNX_NAMESPACE::TensorProto } ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std::string& tensor_proto_name) { - // Given we are using the raw_data field in the protobuf, this will work only for little-endian format. - if constexpr (endian::native != endian::little) { - ORT_THROW("Big endian not supported"); - } - // Set name, dimensions, type, and data of the TensorProto. 
ONNX_NAMESPACE::TensorProto tensor_proto; @@ -1182,7 +1260,7 @@ ONNX_NAMESPACE::TensorProto TensorToTensorProto(const Tensor& tensor, const std: *mutable_string_data->Add() = *f; } } else { - tensor_proto.set_raw_data(tensor.DataRaw(), tensor.SizeInBytes()); + utils::SetRawDataInTensorProto(tensor_proto, tensor.DataRaw(), tensor.SizeInBytes()); } return tensor_proto; @@ -1464,8 +1542,7 @@ common::Status SparseTensorProtoToDenseTensorProto(const ONNX_NAMESPACE::SparseT ORT_RETURN_IF_ERROR(status); } - dense.set_raw_data(std::move(dense_data_storage)); - + utils::SetRawDataInTensorProto(dense, std::move(dense_data_storage)); } else { // No request for std::string status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unsupported sparse tensor data type of ", @@ -1510,7 +1587,17 @@ static void SetIndices(gsl::span gathered_indices, std::string& raw_ind } else { auto* dst = ind_dest + dest_index; T v = static_cast(src_index); - memcpy(dst, &v, sizeof(T)); + if constexpr (endian::native != endian::little) { + auto src = gsl::make_span(static_cast( + reinterpret_cast(&v)), + sizeof(T)); + auto dest = gsl::make_span(static_cast( + reinterpret_cast(dst)), + sizeof(T)); + onnxruntime::utils::SwapByteOrderCopy(sizeof(T), src, dest); + } else { + memcpy(dst, &v, sizeof(T)); + } } ++dest_index; } @@ -1561,7 +1648,7 @@ static void SparsifyGeneric(const void* dense_raw_data, size_t n_dense_elements, } } else { indices.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT8); - indices.set_raw_data(std::string()); + utils::SetRawDataInTensorProto(indices, std::string()); } nnz = gathered_indices.size(); } diff --git a/onnxruntime/core/framework/tensorprotoutils.h b/onnxruntime/core/framework/tensorprotoutils.h index a66caf1ace33b..aabfc0487f3e0 100644 --- a/onnxruntime/core/framework/tensorprotoutils.h +++ b/onnxruntime/core/framework/tensorprotoutils.h @@ -5,6 +5,7 @@ #include #include +#include #include #ifndef SHARED_PROVIDER @@ -19,6 +20,46 @@ #include "core/graph/onnx_protobuf.h" #include "core/platform/env.h" +namespace onnxruntime { +namespace utils { +/** + * This function is used to convert the endianess of Tensor data. + * Mostly, will be used in big endian system to support the model file + * generated on little endian system. + * @param initializer given initializer tensor + * @returns None + */ +void ConvertRawDataInTensorProto(ONNX_NAMESPACE::TensorProto* initializer); + +/** + * Wrapper function for set_raw_data. + * First calls the set_raw_data and then calls ConvertRawDataInTensorProto + * under big endian system. + * @param tensor_proto given initializer tensor + * @param raw_data source raw_data pointer + * @param raw_data_len length of raw_data + * @returns None + */ +template +void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, T1* raw_data, T2 raw_data_len) { + using namespace ONNX_NAMESPACE; + tensor_proto.set_raw_data(raw_data, raw_data_len); + if constexpr (endian::native != endian::little) { + utils::ConvertRawDataInTensorProto((ONNX_NAMESPACE::TensorProto*)&tensor_proto); + } +} + +/** + * Overload Wrapper function for set_raw_data handling string object. + * Forward the string object to set_raw_data. 
+ * @param tensor_proto given initializer tensor + * @param param string object reference + * @returns None + */ +void SetRawDataInTensorProto(ONNX_NAMESPACE::TensorProto& tensor_proto, std::string&& param); +} // namespace utils +} // namespace onnxruntime + namespace ONNX_NAMESPACE { class TensorProto; class TensorShapeProto; diff --git a/onnxruntime/core/graph/graph.cc b/onnxruntime/core/graph/graph.cc index f73a50db7aaa4..442a0db933d65 100644 --- a/onnxruntime/core/graph/graph.cc +++ b/onnxruntime/core/graph/graph.cc @@ -1199,6 +1199,15 @@ Graph::Graph(const Model& owning_model, const gsl::not_null tensor{graph_proto_->add_initializer()}; auto status = utils::ConstantNodeProtoToTensorProto(node, model_path, *tensor); + if constexpr (endian::native != endian::little) { + const AttributeProto& attrib = node.attribute(0); + if (attrib.type() == AttributeProto_AttributeType_SPARSE_TENSOR) { + const TensorProto& sparse_values = node.attribute(0).sparse_tensor().values(); + if ((!(sparse_values.has_raw_data())) && tensor->has_raw_data()) { + onnxruntime::utils::ConvertRawDataInTensorProto(tensor); + } + } + } ORT_ENFORCE(status.IsOK(), status.ToString()); // Ensure initializers are also graph inputs. if (ir_version_ < 4) { @@ -3716,6 +3725,12 @@ SaveInputsOutputsToOrtFormat(flatbuffers::FlatBufferBuilder& builder, const std: common::Status Graph::SaveToOrtFormat(flatbuffers::FlatBufferBuilder& builder, flatbuffers::Offset& fbs_graph) const { + if constexpr (endian::native != endian::little) { + auto& tens = GetAllInitializedTensors(); + for (auto& [name, tensor_p] : tens) { + utils::ConvertRawDataInTensorProto(const_cast(tensor_p)); + } + } auto inputs = SaveInputsOutputsToOrtFormat(builder, graph_inputs_including_initializers_); auto outputs = SaveInputsOutputsToOrtFormat(builder, graph_outputs_); diff --git a/onnxruntime/core/mlas/inc/mlas_q4.h b/onnxruntime/core/mlas/inc/mlas_q4.h index 898fb23cf3e4f..aec14070ffd55 100644 --- a/onnxruntime/core/mlas/inc/mlas_q4.h +++ b/onnxruntime/core/mlas/inc/mlas_q4.h @@ -360,12 +360,12 @@ MlasDequantizeBlockwise( ); /** - * @brief Blockwise 2 bits or 4 bits quantization. After quantization, the weights and zero points - * are packed row-wise. In terms of the qbits type, dst and src have the same shape, and - * scales and zero_points have the same shape. - * columns must be multiple of 8 / qbits. + * @brief Blockwise 4 bits quantization. After quantization, the weights and zero points + * are packed row-wise. If zero_points is null, quantized type is int4 with default + * zero point 0, to align with DQ schema. Otherwise, quantized type is uint4. + * In int4/uint4, dst have the same shape as src, and zero_points have the same shape as scales. * @tparam Tin - * @tparam qbits number of bits used for quantization, 2 or 4 + * @tparam qbits number of bits used for quantization, only 4 is supported * @param src points to the floating point matrix, to be quantized, row major shape [rows, columns] * @param scales points to the scales matrix, row major * @param zero_points points to the zero_points matrix, row major @@ -376,9 +376,10 @@ MlasDequantizeBlockwise( * @param columns * @param quant_block_size number of elements in a quantize block * @param thread_pool + * @return the quantized type is signed. */ template -void +bool MlasQDQQuantizeBlockwise( const Tin* src, Tin* scales, @@ -395,8 +396,17 @@ MlasQDQQuantizeBlockwise( * @brief Transpose blockwise quantized tensors. The src tensors are row major. src weights and zero * points are packed row-wise. 
The dst tensors are column major. dst weights and zero points * are packed column-wise. + * dst_weights and dst_zero_points are in uint4. + * If src_weights is int4 and has src_zero_points, src_weights and src_zero_points are + * converted to uint4 by adding 8. + * If src_weights is int4 and no src_zero_points, src_weights is converted to uint4 by adding 8. + * src_zero_points is 0 and dst_zero_points is 8. + * If src_weights is uint4 and has src_zero_points, just transpose. + * If src_weights is uint4 and no src_zero_points, caller must allocate dst_zero_points with + * 0 values. Otherwise exception is thrown. * @tparam Tin - * @tparam qbits number of bits used for quantization, 2 or 4 + * @tparam qbits number of bits used for quantization, only 4 is supported + * @tparam signed_quant true when quantized type is signed, false when quantized type is unsigned * @param src_weights points to the quantized matrix, row major, shape [rows, columns] in qbits type. * In uint8_t type, shape is [rows, columns * qbits / 8]. * @param src_scales points to the scales matrix, row major @@ -410,7 +420,7 @@ MlasQDQQuantizeBlockwise( * @param quant_block_size number of elements in a quantize block * @param thread_pool */ -template +template void MlasQDQTransposeBlockwiseQuantized( const uint8_t* src_weights, diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index 72eb35c894094..859b7c2f560a4 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -20,8 +20,15 @@ Module Name: #include #include -#if defined(MLAS_TARGET_POWER) && defined(__linux__) +#if defined(MLAS_TARGET_POWER) +#if defined(__linux__) #include +#elif defined(_AIX) +#define POWER_10 0x40000 +#define POWER_10_ANDUP (POWER_10) +#include +#define __power_10_andup() (_system_configuration.implementation & POWER_10_ANDUP) +#endif #endif #if defined(MLAS_TARGET_ARM64) @@ -554,6 +561,9 @@ Return Value: unsigned long hwcap2 = getauxval(AT_HWCAP2); bool HasP9Instructions = hwcap2 & PPC_FEATURE2_ARCH_3_00; +#elif defined(_AIX) + bool HasP9Instructions = __power_9_andup(); +#endif // __linux__ if (HasP9Instructions) { this->QuantizeLinearS8Kernel = MlasQuantizeLinearS8KernelVSX; this->QuantizeLinearU8Kernel = MlasQuantizeLinearU8KernelVSX; @@ -562,7 +572,11 @@ Return Value: #if defined(POWER10) #if (defined(__GNUC__) && ((__GNUC__ > 10) || (__GNUC__== 10 && __GNUC_MINOR__ >= 2))) || \ (defined(__clang__) && (__clang_major__ >= 12)) +#if defined(__linux__) bool HasP10Instructions = ((hwcap2 & PPC_FEATURE2_MMA) && (hwcap2 & PPC_FEATURE2_ARCH_3_1)); +#elif defined(_AIX) + bool HasP10Instructions = (__power_10_andup() && __power_mma_version() == MMA_V31); +#endif // __linux__ if (HasP10Instructions) { this->GemmFloatKernel = MlasSgemmKernelPOWER10; this->GemmDoubleKernel = MlasDgemmKernelPOWER10; @@ -571,7 +585,6 @@ Return Value: #endif #endif -#endif // __linux__ #endif // MLAS_TARGET_POWER #if defined(MLAS_TARGET_LARCH64) @@ -676,7 +689,6 @@ MlasPlatformU8S8Overflow( } #endif - thread_local size_t ThreadedBufSize = 0; #ifdef _MSC_VER thread_local std::unique_ptr ThreadedBufHolder(nullptr, &_aligned_free); diff --git a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp index a67be1dbfa710..0f3bc1d579711 100644 --- a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp +++ b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp @@ -874,10 +874,18 @@ MlasQgemmStoreVectorMMA { size_t RowCount; __vector 
signed int vsum0, vsum1, vsum2, vsum3; +#if defined(_AIX) && defined(__clang__) + __vector signed int columnsum = *reinterpret_cast(&ColumnSumBuffer[pos]); +#else __vector signed int columnsum = *reinterpret_cast(&ColumnSumBuffer[pos]); +#endif C += VectorCount; if (ZeroPointB != nullptr) { +#if defined(_AIX) && defined(__clang__) + __vector signed int zeropoint = *reinterpret_cast(&ZeroPointB[pos]); +#else __vector signed int zeropoint = *reinterpret_cast(&ZeroPointB[pos]); +#endif if (ZeroMode) { for (RowCount = 0; RowCount + 4 <= row; RowCount += 4, C += ldc*4) { vsum0 = vec_splats(RowSumBuffer[RowCount + 0]) * zeropoint + columnsum; diff --git a/onnxruntime/core/mlas/lib/q4_dq.cpp b/onnxruntime/core/mlas/lib/q4_dq.cpp index 62fe58ca333de..015d69de68766 100644 --- a/onnxruntime/core/mlas/lib/q4_dq.cpp +++ b/onnxruntime/core/mlas/lib/q4_dq.cpp @@ -314,14 +314,18 @@ struct Shape2D { }; -template +template struct BitsTraits { static_assert(qbits <= 8, "Only BitsTraits are for small number of bits!"); static constexpr int kBits = qbits; - static constexpr int kMax = (1 << qbits) - 1; - static constexpr int kMid = 1 << (qbits - 1); + static constexpr int kMax = signed_quant ? (1 << (qbits -1)) - 1 : (1 << qbits) - 1; + static constexpr int kMid = signed_quant ? 0 : (1 << (qbits - 1)); + static constexpr int kMin = signed_quant ? -(1 << (qbits - 1)) : 0; static constexpr float kMaxFp = static_cast(kMax); + static constexpr float kMinFp = static_cast(kMin); + static constexpr float fullRange = kMaxFp - kMinFp; + static constexpr float halfRange = static_cast(kMid - kMin); // number of qbit elements to pack into whole bytes static constexpr int kPackSize = (qbits == 8) ? 1 : (qbits == 4) ? 2 : (qbits == 2) ? 4 : 0; @@ -331,53 +335,54 @@ struct BitsTraits { /** * @brief Rectify min/max from a set of weights, and convert to scale and zero point - * for quantization - * @tparam ScaleT type of scale, usually floating point of various bits - * @tparam qbits number of int bits used for zero point value + * for quantization. + * @tparam ScaleT type of scale, usually floating point of various bits + * @tparam qbits number of int bits used for zero point value + * @tparam signed_quant output quantized type is signed * @param[in] min * @param[in] max * @param[out] scale * @param[out] zp */ -template +template MLAS_FORCEINLINE void range2scalezp(float min, float max, ScaleT& scale, uint8_t& zp) { - constexpr int zp_max = BitsTraits::kMax; - constexpr float zp_max_fp = BitsTraits::kMaxFp; - min = std::min(min, 0.0f); max = std::max(max, 0.0f); - float scale_f = (max - min) / zp_max; + float scale_f = (max - min) / BitsTraits::fullRange; float zero_point_fp = min; if (scale_f != 0.0f) { - zero_point_fp = 0.f - min / scale_f; + zero_point_fp = BitsTraits::kMinFp - min / scale_f; } - if (zero_point_fp < 0.0f) { - zp = 0; - } else if (zero_point_fp > zp_max_fp) { - zp = zp_max; + if (zero_point_fp < BitsTraits::kMinFp) { + zp = static_cast(BitsTraits::kMin); + } else if (zero_point_fp > BitsTraits::kMaxFp) { + zp = static_cast(BitsTraits::kMax); } else { zp = (uint8_t)roundf(zero_point_fp); } scale = ScaleT(scale_f); } -template +/** + * @brief Rectify min/max from a set of symmetric weights, and convert + * to scale for quantization. + */ +template MLAS_FORCEINLINE void range2scale(float min, float max, ScaleT& scale) { - constexpr int mid_v = BitsTraits::kMid; - constexpr float mid_fp = static_cast(-mid_v); - max = fabsf(max) > fabsf(min) ? 
max : min; - - scale = ScaleT(max / mid_fp); + // !!Note: in the quantized space, abs of min -8 > abs of max 7. + // Therefore map the larger half FP space to [-8, 0]. + // Minus sign achieves this purpose. + scale = ScaleT(-max / BitsTraits::halfRange); }; @@ -400,7 +405,7 @@ struct BlockwiseQuantizer { static_assert(qbits == 4, "Only 4b block quantization is supported!"); using QuantBlk = std::conditional_t, Shape2D<1, block_size>>; - using ThreadBlk = Shape2D::kPackSize, QuantBlk::kColumn>; + using ThreadBlk = Shape2D::kPackSize, QuantBlk::kColumn>; static MLAS_FORCEINLINE @@ -474,8 +479,8 @@ struct BlockwiseQuantizer { MlasTryBatchParallel( thread_pool, total_thrd_blks, [&](ptrdiff_t block_idx) { - uint8_t zp_bytes[BitsTraits::kPackSize]; - std::fill_n(zp_bytes, BitsTraits::kPackSize, (uint8_t)8); + uint8_t zp_bytes[BitsTraits::kPackSize]; + std::fill_n(zp_bytes, BitsTraits::kPackSize, (uint8_t)8); const int32_t r_blk_idx = static_cast(block_idx / thrd_col_blks); const int32_t c_blk_idx = static_cast(block_idx % thrd_col_blks); @@ -490,7 +495,7 @@ struct BlockwiseQuantizer { const int meta_col = c / QuantBlk::kColumn; // compute scale and zero point - for (int kpack = 0; kpack < BitsTraits::kPackSize; kpack++) { + for (int kpack = 0; kpack < BitsTraits::kPackSize; kpack++) { // scan a single block to extract range [min, max] float min = std::numeric_limits::max(); @@ -509,9 +514,9 @@ struct BlockwiseQuantizer { if (row_start < row_end) { const int32_t meta_idx = meta_col * row_blks + meta_row + kpack; if (zero_points == nullptr) { - range2scale(min, max, scales[meta_idx]); + range2scale(min, max, scales[meta_idx]); } else { - range2scalezp(min, max, scales[meta_idx], zp_bytes[kpack]); + range2scalezp(min, max, scales[meta_idx], zp_bytes[kpack]); } } } @@ -533,7 +538,7 @@ struct BlockwiseQuantizer { const float v0 = static_cast(src[i * leadingDimension + j]); const uint8_t vi0 = (uint8_t)std::clamp(roundf(v0 * reciprocal_scale + zp), - 0.0f, BitsTraits::kMaxFp); + 0.0f, BitsTraits::kMaxFp); uint8_t vi1 = (uint8_t)zp; if (i + 1 < r_end) { @@ -545,7 +550,7 @@ struct BlockwiseQuantizer { } const float v1 = static_cast(src[(i + 1) * leadingDimension + j]); vi1 = (uint8_t)std::clamp(roundf(v1 * reciprocal_scale1 + zp1), 0.0f, - BitsTraits::kMaxFp); + BitsTraits::kMaxFp); } // !! 4b specific code @@ -644,14 +649,19 @@ struct BlockwiseQuantizer { * in memory are packed together, which means the packing is along the row. Quantized data * are stored in row major, so the output tensor reserves same shape, in terms of qbits type, * as the input tensor. - * @tparam Tin source data type, e.g. fp32/fp16 - * @tparam qbits number of bits in each quantized element + * If has zero points, quantized type is unsigned. Otherwise, quantized type is signed and the + * zero point is 0. + * The transposed outputs are used by MatMulNBits, so quant type becomes uint4 with default + * zp at 8. + * @tparam Tin source data type, e.g. 
fp32/fp16 + * @tparam qbits number of bits in each quantized element + * @tparam signed_quant quantized type is signed */ -template +template struct BlockwiseQDQQuantizer; -template -struct BlockwiseQDQQuantizer { +template +struct BlockwiseQDQQuantizer { static MLAS_FORCEINLINE uint8_t GetElem(uint8_t val, int32_t idx) { return (val >> (idx << 2)) & 0xF; @@ -663,9 +673,14 @@ struct BlockwiseQDQQuantizer { return ((val & 0xF) << shift) | (dst & (~(0xF << shift))); } + template static MLAS_FORCEINLINE uint8_t Pack(uint8_t v0, uint8_t v1) { - return (v0 & 0xF) | ((v1 & 0xF) << 4); + if constexpr (add8) { + return ((v0 & 0xF) ^ 8) | (((v1 & 0xF) ^ 8) << 4); + } else { + return (v0 & 0xF) | ((v1 & 0xF) << 4); + } } // If src is row major, then dst is column major. Transpose: @@ -680,10 +695,16 @@ struct BlockwiseQDQQuantizer { // --> // | dst0: low 4 bit | dst0: high 4 bit | // | dst1: low 4 bit | dst1: high 4 bit | + template static MLAS_FORCEINLINE void Transpose(uint8_t src0, uint8_t src1, uint8_t& dst0, uint8_t& dst1) { - dst0 = (src0 & 0xF) | ((src1 & 0xF) << 4); - dst1 = ((src0 & 0xF0) >> 4) | (src1 & 0xF0); + if constexpr (add8) { + dst0 = ((src0 & 0xF) ^ 8) | (((src1 & 0xF) ^ 8) << 4); + dst1 = (((src0 & 0xF0) ^ 0x80) >> 4) | ((src1 & 0xF0) ^ 0x80); + } else { + dst0 = (src0 & 0xF) | ((src1 & 0xF) << 4); + dst1 = ((src0 & 0xF0) >> 4) | (src1 & 0xF0); + } } static MLAS_FORCEINLINE uint8_t QuantizeV(Tin src, float reciprocal_scale, uint8_t zero_point) @@ -693,54 +714,12 @@ struct BlockwiseQDQQuantizer { static_cast( std::roundf(static_cast(src) * reciprocal_scale) ) + static_cast(zero_point), - 0, - BitsTraits<4>::kMax + BitsTraits<4, signed_quant>::kMin, + BitsTraits<4, signed_quant>::kMax ) ); } - /** - * @brief Quantize a matrix shape [rows, columns] row-wise. Scales and zero points are calculated. - * Quantized data are packed row-wise based on qbits. Quantized data are stored in row - * major, so the output tensor reserves the shape, in terms output type. - * Thread block is [1, quant_block_size * 2]. - * @param src the source matrix, row major: [rows * columns] - * @param scales the scales of quantized blocks, row major layout with shape: - * [rows * ceil(columns / quant_block_size)] - * @param zero_points the zero points of quantized blocks, packed. Same shape as scales - * in terms of output type. In terms of uint8_t, the shape is: - * [ceil(rows * ceil(columns / quant_block_size) * qbits / 8)] - * @param dst the quantized weights, row major: [rows * columns] in terms of - * output type. In terms of uint8_t, the shape is: [ceil(rows * columns * qbits / 8] - * @param rows number of rows in the source matrix - * @param columns number of columns in the source matrix, must satisfy - * ceil(columns / quant_block_size) % 2 == 0, so in each thread block, - * zero points are packed into one byte. - * @param quant_block_size number of elements quantized together. 
- * @param thread_pool thread pool for parallel processing - */ - static void QuantizeRowWise( - const Tin* src, - Tin* scales, - uint8_t* zero_points, - uint8_t* dst, - int32_t rows, - int32_t columns, - int32_t quant_block_size, - MLAS_THREADPOOL* thread_pool - ) - { - MLAS_UNREFERENCED_PARAMETER(src); - MLAS_UNREFERENCED_PARAMETER(scales); - MLAS_UNREFERENCED_PARAMETER(zero_points); - MLAS_UNREFERENCED_PARAMETER(dst); - MLAS_UNREFERENCED_PARAMETER(rows); - MLAS_UNREFERENCED_PARAMETER(columns); - MLAS_UNREFERENCED_PARAMETER(quant_block_size); - MLAS_UNREFERENCED_PARAMETER(thread_pool); - ORT_THROW("BlockwiseQDQQuantizer::BlockwiseQDQQuantizer is not implemented"); - } - /** * @brief Quantize a matrix shape [rows, columns] column-wise. Scales and zero points are calculated. * Quantized data are packed row-wise based on qbits. Quantized data are stored in row major @@ -769,6 +748,7 @@ struct BlockwiseQDQQuantizer { MLAS_THREADPOOL* thread_pool ) { + ORT_ENFORCE(zero_points || signed_quant, "Unsigned quant with no zero points is not supported."); // Must avoid multiple thread write to a single byte, which means the starting index // of a thread block must be even. To achieve that, we need to customize the thread // block size based on the parity of columns. @@ -815,6 +795,10 @@ struct BlockwiseQDQQuantizer { MLAS_THREADPOOL* thread_pool ) { + ORT_ENFORCE( + src_zero_points || signed_quant || dst_zero_points, + "Unsigned quant types without zero points must allocate zero points with value 0." + ); // Must avoid multiple thread write to a single byte, which means the starting index // of a thread block must be even. To achieve that, we need to customize the thread // block size based on the parity of columns. @@ -896,15 +880,15 @@ struct BlockwiseQDQQuantizer { // calculate scale and zero point, and store for (int32_t i = 0; i < col_size; i += 2) { - v0_tt = v1_tt = BitsTraits<4>::kMid; + v0_tt = v1_tt = BitsTraits<4, signed_quant>::kMid; if (zero_points) { - range2scalezp(vmin_t[i], vmax_t[i], scale0_tt, v0_tt); - range2scalezp(vmin_t[i + 1], vmax_t[i + 1], scale1_tt, v1_tt); - zero_points[(scale_idx + i) >> 1] = Pack(v0_tt, v1_tt); + range2scalezp(vmin_t[i], vmax_t[i], scale0_tt, v0_tt); + range2scalezp(vmin_t[i + 1], vmax_t[i + 1], scale1_tt, v1_tt); + zero_points[(scale_idx + i) >> 1] = Pack(v0_tt, v1_tt); } else { - range2scale(vmin_t[i], vmax_t[i], scale0_tt); - range2scale(vmin_t[i + 1], vmax_t[i + 1], scale1_tt); + range2scale(vmin_t[i], vmax_t[i], scale0_tt); + range2scale(vmin_t[i + 1], vmax_t[i + 1], scale1_tt); } scales[scale_idx + i] = scale0_tt; @@ -925,7 +909,7 @@ struct BlockwiseQDQQuantizer { for (int32_t i = 0; i < col_size; i += 2) { v0_tt = QuantizeV(src[input_idx_t + i], reciprocal_scale_t[i], zp_t[i]); v1_tt = QuantizeV(src[input_idx_t + i + 1], reciprocal_scale_t[i + 1], zp_t[i + 1]); - dst[(input_idx_t + i) >> 1] = Pack(v0_tt, v1_tt); + dst[(input_idx_t + i) >> 1] = Pack(v0_tt, v1_tt); } } } @@ -993,14 +977,14 @@ struct BlockwiseQDQQuantizer { int32_t col_idx = 0; // leading unailgned zero points if (scale_buffer_idx & 1) { - v0_tt = BitsTraits<4>::kMid; + v0_tt = BitsTraits<4, signed_quant>::kMid; if (zero_points) { - range2scalezp(vmin_t[0], vmax_t[0], scale0_tt, v0_tt); + range2scalezp(vmin_t[0], vmax_t[0], scale0_tt, v0_tt); zero_points[scale_buffer_idx >> 1] = SetElem( v0_tt, 1, zero_points[scale_buffer_idx >> 1] ); } else { - range2scale(vmin_t[0], vmax_t[0], scale0_tt); + range2scale(vmin_t[0], vmax_t[0], scale0_tt); } scales[scale_buffer_idx] = scale0_tt; @@ 
-1014,14 +998,16 @@ struct BlockwiseQDQQuantizer { } // aligned zero points for (; scale_buffer_idx < scale_buffer_idx_end - 1; col_idx += 2, scale_buffer_idx += 2) { - v0_tt = v1_tt = BitsTraits<4>::kMid; + v0_tt = v1_tt = BitsTraits<4, signed_quant>::kMid; if (zero_points) { - range2scalezp(vmin_t[col_idx], vmax_t[col_idx], scale0_tt, v0_tt); - range2scalezp(vmin_t[col_idx + 1], vmax_t[col_idx + 1], scale1_tt, v1_tt); - zero_points[scale_buffer_idx >> 1] = Pack(v0_tt, v1_tt); + range2scalezp(vmin_t[col_idx], vmax_t[col_idx], scale0_tt, v0_tt); + range2scalezp( + vmin_t[col_idx + 1], vmax_t[col_idx + 1], scale1_tt, v1_tt + ); + zero_points[scale_buffer_idx >> 1] = Pack(v0_tt, v1_tt); } else { - range2scale(vmin_t[col_idx], vmax_t[col_idx], scale0_tt); - range2scale(vmin_t[col_idx + 1], vmax_t[col_idx + 1], scale1_tt); + range2scale(vmin_t[col_idx], vmax_t[col_idx], scale0_tt); + range2scale(vmin_t[col_idx + 1], vmax_t[col_idx + 1], scale1_tt); } scales[scale_buffer_idx] = scale0_tt; @@ -1037,14 +1023,14 @@ struct BlockwiseQDQQuantizer { } // tailing unaligned elements if (scale_buffer_idx < scale_buffer_idx_end) { - v0_tt = BitsTraits<4>::kMid; + v0_tt = BitsTraits<4, signed_quant>::kMid; if (zero_points) { - range2scalezp(vmin_t[col_idx], vmax_t[col_idx], scale0_tt, v0_tt); + range2scalezp(vmin_t[col_idx], vmax_t[col_idx], scale0_tt, v0_tt); zero_points[scale_buffer_idx >> 1] = SetElem( v0_tt, 0, zero_points[scale_buffer_idx >> 1] ); } else { - range2scale(vmin_t[col_idx], vmax_t[col_idx], scale0_tt); + range2scale(vmin_t[col_idx], vmax_t[col_idx], scale0_tt); } scales[scale_buffer_idx] = scale0_tt; @@ -1078,7 +1064,7 @@ struct BlockwiseQDQQuantizer { src[input_idx_t_start + 1], reciprocal_scale_t[col_idx + 1], zp_t[col_idx + 1] ); - dst[input_idx_t_start >> 1] = Pack(v0_tt, v1_tt); + dst[input_idx_t_start >> 1] = Pack(v0_tt, v1_tt); } // tailing unaligned output if (input_idx_t_start < input_idx_t_end) { @@ -1144,7 +1130,7 @@ struct BlockwiseQDQQuantizer { src0_t = src_weights[src_idx]; src1_t = src_weights[src_idx + packed_col_size]; src_idx += packed_col_size + packed_col_size; - Transpose(src0_t, src1_t, dst0_t, dst1_t); + Transpose(src0_t, src1_t, dst0_t, dst1_t); dst_weights[dst_idx] = dst0_t; dst_weights[dst_idx + dstT_num_row] = dst1_t; } @@ -1152,7 +1138,7 @@ struct BlockwiseQDQQuantizer { if (src_idx < src_end_idx) { src0_t = src_weights[src_idx]; src1_t = 0; - Transpose(src0_t, src1_t, dst0_t, dst1_t); + Transpose(src0_t, src1_t, dst0_t, dst1_t); dst_weights[dst_idx] = dst0_t; dst_weights[dst_idx + dstT_num_row] = dst1_t; } @@ -1190,7 +1176,7 @@ struct BlockwiseQDQQuantizer { for (; src_idx < src_end_idx - packed_col_size; ++dst_idx) { src0_t = src_zero_points[src_idx]; src1_t = src_zero_points[src_idx + packed_col_size]; - Transpose(src0_t, src1_t, dst0_t, dst1_t); + Transpose(src0_t, src1_t, dst0_t, dst1_t); dst_zero_points[dst_idx] = dst0_t; dst_zero_points[dst_idx + dst_zp_row_num] = dst1_t; src_idx += packed_col_size + packed_col_size; @@ -1199,7 +1185,7 @@ struct BlockwiseQDQQuantizer { if (src_idx < src_end_idx) { src0_t = src_zero_points[src_idx]; src1_t = 0; - Transpose(src0_t, src1_t, dst0_t, dst1_t); + Transpose(src0_t, src1_t, dst0_t, dst1_t); dst_zero_points[dst_idx] = dst0_t; dst_zero_points[dst_idx + dst_zp_row_num] = dst1_t; } @@ -1247,13 +1233,13 @@ struct BlockwiseQDQQuantizer { for (; src_idx < src_end_idx - columns; ++dst_idx) { src0_t = GetElem(src_weights[src_idx >> 1], src_idx & 1); src1_t = GetElem(src_weights[(src_idx + columns) >> 1], (src_idx + 
columns) & 1); - dst_weights[dst_idx] = (src0_t & 0xf) | ((src1_t & 0xf) << 4); + dst_weights[dst_idx] = Pack(src0_t, src1_t); src_idx += columns + columns; } if (src_idx < src_end_idx) { src0_t = GetElem(src_weights[src_idx >> 1], src_idx & 1); - dst_weights[dst_idx] = src0_t & 0xf; + dst_weights[dst_idx] = Pack(src0_t, 0); } } ); @@ -1288,13 +1274,13 @@ struct BlockwiseQDQQuantizer { for (; src_idx < src_end_idx - columns; ++dst_idx) { src0_t = GetElem(src_zero_points[src_idx >> 1], src_idx & 1); src1_t = GetElem(src_zero_points[(src_idx + columns) >> 1], (src_idx + columns) & 1); - dst_zero_points[dst_idx] = (src0_t & 0xf) | ((src1_t & 0xf) << 4); + dst_zero_points[dst_idx] = Pack(src0_t, src1_t); src_idx += columns + columns; } if (src_idx < src_end_idx) { src0_t = GetElem(src_zero_points[src_idx >> 1], src_idx & 1); - dst_zero_points[dst_idx] = src0_t & 0xf; + dst_zero_points[dst_idx] = Pack(src0_t, 0); } } ); @@ -1745,7 +1731,7 @@ MlasDequantizeBlockwise( ); template -void +bool MlasQDQQuantizeBlockwise( const Tin* src, Tin* scales, @@ -1759,17 +1745,23 @@ MlasQDQQuantizeBlockwise( ) { if (columnwise) { - BlockwiseQDQQuantizer::QuantizeColumnWise( - src, scales, zero_points, dst, rows, columns, quant_block_size, thread_pool - ); + if (zero_points) { + BlockwiseQDQQuantizer::QuantizeColumnWise( + src, scales, zero_points, dst, rows, columns, quant_block_size, thread_pool + ); + return false; + } else { + BlockwiseQDQQuantizer::QuantizeColumnWise( + src, scales, zero_points, dst, rows, columns, quant_block_size, thread_pool + ); + return true; + } } else { - BlockwiseQDQQuantizer::QuantizeRowWise( - src, scales, zero_points, dst, rows, columns, quant_block_size, thread_pool - ); + ORT_THROW("Row-wise MlasQDQQuantizeBlockwise is not implemented"); } } -template void +template bool MlasQDQQuantizeBlockwise( const float* src, float* scales, @@ -1782,7 +1774,7 @@ MlasQDQQuantizeBlockwise( MLAS_THREADPOOL* thread_pool ); -template void +template bool MlasQDQQuantizeBlockwise( const MLAS_FP16* src, MLAS_FP16* scales, @@ -1795,7 +1787,7 @@ MlasQDQQuantizeBlockwise( MLAS_THREADPOOL* thread_pool ); -template +template void MlasQDQTransposeBlockwiseQuantized( const uint8_t* src_weights, @@ -1812,7 +1804,7 @@ MlasQDQTransposeBlockwiseQuantized( ) { if (columnwise) { - BlockwiseQDQQuantizer::TransposeColumnWiseQuantized( + BlockwiseQDQQuantizer::TransposeColumnWiseQuantized( src_weights, src_scales, src_zero_points, dst_weights, dst_scales, dst_zero_points, rows, columns, quant_block_size, thread_pool ); @@ -1822,7 +1814,22 @@ MlasQDQTransposeBlockwiseQuantized( } template void -MlasQDQTransposeBlockwiseQuantized( +MlasQDQTransposeBlockwiseQuantized( + const uint8_t* src_weights, + const float* src_scales, + const uint8_t* src_zero_points, + uint8_t* dst_weights, + float* dst_scales, + uint8_t* dst_zero_points, + bool columnwise, + int rows, + int columns, + int quant_block_size, + MLAS_THREADPOOL* thread_pool +); + +template void +MlasQDQTransposeBlockwiseQuantized( const uint8_t* src_weights, const float* src_scales, const uint8_t* src_zero_points, @@ -1837,7 +1844,22 @@ MlasQDQTransposeBlockwiseQuantized( ); template void -MlasQDQTransposeBlockwiseQuantized( +MlasQDQTransposeBlockwiseQuantized( + const uint8_t* src_weights, + const MLAS_FP16* src_scales, + const uint8_t* src_zero_points, + uint8_t* dst_weights, + MLAS_FP16* dst_scales, + uint8_t* dst_zero_points, + bool columnwise, + int rows, + int columns, + int quant_block_size, + MLAS_THREADPOOL* thread_pool +); + +template void 
+MlasQDQTransposeBlockwiseQuantized( const uint8_t* src_weights, const MLAS_FP16* src_scales, const uint8_t* src_zero_points, diff --git a/onnxruntime/core/mlas/lib/qgemm.h b/onnxruntime/core/mlas/lib/qgemm.h index 75c17a6b5a177..127aea9029b65 100644 --- a/onnxruntime/core/mlas/lib/qgemm.h +++ b/onnxruntime/core/mlas/lib/qgemm.h @@ -894,7 +894,7 @@ MlasGemmQuantGetDispatch( if (!AIsSigned) { GemmQuantDispatch = &MlasGemmU8X8DispatchWasmSimd; } -#elif defined(MLAS_TARGET_POWER) && defined(__linux__) && defined(POWER10) && \ +#elif defined(MLAS_TARGET_POWER) && (defined(__linux__) || defined(_AIX)) && defined(POWER10) && \ ((defined(__GNUC__) && ((__GNUC__ > 10) || (__GNUC__== 10 && __GNUC_MINOR__ >= 2))) || \ (defined(__clang__) && (__clang_major__ >= 12))) if (GetMlasPlatform().GemmU8X8Dispatch == &MlasGemm8X8DispatchPOWER10) { diff --git a/onnxruntime/core/mlas/lib/qlmul.cpp b/onnxruntime/core/mlas/lib/qlmul.cpp index 38818e1190d21..4a6d57db0d211 100644 --- a/onnxruntime/core/mlas/lib/qlmul.cpp +++ b/onnxruntime/core/mlas/lib/qlmul.cpp @@ -325,12 +325,20 @@ MlasQLinearMulKernel( } while (N >= 4) { - __vector int32_t IntegerAVector {InputA[0], InputA[1], InputA[2], InputA[3]}; +#if defined(_AIX) && defined(__clang__) + __vector int IntegerAVector {InputA[0], InputA[1], InputA[2], InputA[3]}; +#else + __vector int32_t IntegerAVector {InputA[0], InputA[1], InputA[2], InputA[3]}; +#endif auto IntegerVector = vec_sub(IntegerAVector, ZeroPointAVector); auto ValueAVector = vec_mul(ScaleAVector, vec_ctf(IntegerVector, 0)); if (!IsScalarB) { - __vector int32_t IntegerBVector {InputB[0], InputB[1], InputB[2], InputB[3]}; +#if defined(_AIX) && defined(__clang__) + __vector int IntegerBVector {InputB[0], InputB[1], InputB[2], InputB[3]}; +#else + __vector int32_t IntegerBVector {InputB[0], InputB[1], InputB[2], InputB[3]}; +#endif IntegerVector = vec_sub(IntegerBVector, ZeroPointBVector); ValueBVector = vec_mul(ScaleBVector, vec_ctf(IntegerVector, 0)); } diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp index db3b9ee656592..ec5cdbc75220a 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp @@ -155,7 +155,7 @@ namespace template MLAS_FORCEINLINE void -SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( +SQ4BitGemm_CompInt8_Compute4x2_BlkLen16( const std::byte* QuantARowPtr, const std::byte* QuantBDataColPtr, const float* QuantBScaleColPtr, @@ -177,11 +177,13 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( const float* QuantBScalePtr = QuantBScaleColPtr; const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr; - float32x4_t acc00{}, acc01{}, acc10{}, acc11{}; + float32x4_t acc00{}, acc01{}, acc10{}, acc11{}, acc20{}, acc21{}, acc30{}, acc31{}; for (size_t k_blk_idx = 0; k_blk_idx < BlockCountK; ++k_blk_idx) { const std::byte* QuantABlkRow0 = QuantAPtr; const std::byte* QuantABlkRow1 = QuantAPtr + StrideQuantA; + const std::byte* QuantABlkRow2 = QuantAPtr + StrideQuantA * 2; + const std::byte* QuantABlkRow3 = QuantAPtr + StrideQuantA * 3; const float QuantBScaleCol0 = *QuantBScalePtr; const float QuantBScaleCol1 = *(QuantBScalePtr + StrideQuantBScale); @@ -191,6 +193,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( const float scale01 = Q8BlkScale(QuantABlkRow0) * QuantBScaleCol1; const float scale10 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol0; const float scale11 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol1; + const float scale20 = 
Q8BlkScale(QuantABlkRow2) * QuantBScaleCol0; + const float scale21 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol1; + const float scale30 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol0; + const float scale31 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol1; // load B zero point int8_t bzp_col0; @@ -212,13 +218,11 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( const int8_t* QuantADataPtrRow0 = Q8BlkData(QuantABlkRow0); const int8_t* QuantADataPtrRow1 = Q8BlkData(QuantABlkRow1); + const int8_t* QuantADataPtrRow2 = Q8BlkData(QuantABlkRow2); + const int8_t* QuantADataPtrRow3 = Q8BlkData(QuantABlkRow3); // TODO handling only 16 elements per accumulator at a time here, probably can do better { - // load A - const int8x16_t av_row0 = vld1q_s8(QuantADataPtrRow0 + 0); - const int8x16_t av_row1 = vld1q_s8(QuantADataPtrRow1 + 0); - // load B const uint8x8_t bv_packed_col0 = vld1_u8(reinterpret_cast(QuantBDataPtr)); const uint8x8_t bv_packed_col1 = vld1_u8(reinterpret_cast(QuantBDataPtr) + StrideQuantBData); @@ -242,24 +246,55 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( bv_col0 = vsubq_s8(bv_col0, vdupq_n_s8(bzp_col0)); bv_col1 = vsubq_s8(bv_col1, vdupq_n_s8(bzp_col1)); - // quantized dot product - int32x4_t dot00{}, dot01{}, dot10{}, dot11{}; - dot00 = vdotq_s32(dot00, av_row0, bv_col0); - dot01 = vdotq_s32(dot01, av_row0, bv_col1); - dot10 = vdotq_s32(dot10, av_row1, bv_col0); - dot11 = vdotq_s32(dot11, av_row1, bv_col1); - - // convert to float - const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00); - const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01); - const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10); - const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11); + // rows 0 and 1 of A + { + // load A + const int8x16_t av_row0 = vld1q_s8(QuantADataPtrRow0 + 0); + const int8x16_t av_row1 = vld1q_s8(QuantADataPtrRow1 + 0); + + // quantized dot product + const int32x4_t dot00 = vdotq_s32(int32x4_t{}, av_row0, bv_col0); + const int32x4_t dot01 = vdotq_s32(int32x4_t{}, av_row0, bv_col1); + const int32x4_t dot10 = vdotq_s32(int32x4_t{}, av_row1, bv_col0); + const int32x4_t dot11 = vdotq_s32(int32x4_t{}, av_row1, bv_col1); + + // convert to float + const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00); + const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01); + const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10); + const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11); + + // multiply by scale and update accumulator + acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00)); + acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01)); + acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10)); + acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11)); + } - // multiply by scale and update accumulator - acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00)); - acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01)); - acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10)); - acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11)); + // rows 2 and 3 of A + { + // load A + const int8x16_t av_row2 = vld1q_s8(QuantADataPtrRow2 + 0); + const int8x16_t av_row3 = vld1q_s8(QuantADataPtrRow3 + 0); + + // quantized dot product + const int32x4_t dot20 = vdotq_s32(int32x4_t{}, av_row2, bv_col0); + const int32x4_t dot21 = vdotq_s32(int32x4_t{}, av_row2, bv_col1); + const int32x4_t dot30 = vdotq_s32(int32x4_t{}, av_row3, bv_col0); + const int32x4_t dot31 = vdotq_s32(int32x4_t{}, av_row3, bv_col1); + + // convert to float + const float32x4_t dot_f32_20 = vcvtq_f32_s32(dot20); + const float32x4_t dot_f32_21 = 
vcvtq_f32_s32(dot21); + const float32x4_t dot_f32_30 = vcvtq_f32_s32(dot30); + const float32x4_t dot_f32_31 = vcvtq_f32_s32(dot31); + + // multiply by scale and update accumulator + acc20 = vfmaq_f32(acc20, dot_f32_20, vdupq_n_f32(scale20)); + acc21 = vfmaq_f32(acc21, dot_f32_21, vdupq_n_f32(scale21)); + acc30 = vfmaq_f32(acc30, dot_f32_30, vdupq_n_f32(scale30)); + acc31 = vfmaq_f32(acc31, dot_f32_31, vdupq_n_f32(scale31)); + } } // increment block pointers @@ -273,22 +308,30 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( } } - SumPtr[0] = vaddvq_f32(acc00); - SumPtr[1] = vaddvq_f32(acc01); - SumPtr[ldc + 0] = vaddvq_f32(acc10); - SumPtr[ldc + 1] = vaddvq_f32(acc11); + SumPtr[ldc * 0 + 0] = vaddvq_f32(acc00); + SumPtr[ldc * 0 + 1] = vaddvq_f32(acc01); + SumPtr[ldc * 1 + 0] = vaddvq_f32(acc10); + SumPtr[ldc * 1 + 1] = vaddvq_f32(acc11); + SumPtr[ldc * 2 + 0] = vaddvq_f32(acc20); + SumPtr[ldc * 2 + 1] = vaddvq_f32(acc21); + SumPtr[ldc * 3 + 0] = vaddvq_f32(acc30); + SumPtr[ldc * 3 + 1] = vaddvq_f32(acc31); if (BiasPtr != nullptr) { - SumPtr[0] += BiasPtr[0]; - SumPtr[1] += BiasPtr[1]; - SumPtr[ldc + 0] += BiasPtr[0]; - SumPtr[ldc + 1] += BiasPtr[1]; + SumPtr[ldc * 0 + 0] += BiasPtr[0]; + SumPtr[ldc * 0 + 1] += BiasPtr[1]; + SumPtr[ldc * 1 + 0] += BiasPtr[0]; + SumPtr[ldc * 1 + 1] += BiasPtr[1]; + SumPtr[ldc * 2 + 0] += BiasPtr[0]; + SumPtr[ldc * 2 + 1] += BiasPtr[1]; + SumPtr[ldc * 3 + 0] += BiasPtr[0]; + SumPtr[ldc * 3 + 1] += BiasPtr[1]; } } template MLAS_FORCEINLINE void -SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( +SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16( size_t BlkLen, const std::byte* QuantARowPtr, const std::byte* QuantBDataColPtr, @@ -312,11 +355,13 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( const float* QuantBScalePtr = QuantBScaleColPtr; const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr; - float32x4_t acc00{}, acc01{}, acc10{}, acc11{}; + float32x4_t acc00{}, acc01{}, acc10{}, acc11{}, acc20{}, acc21{}, acc30{}, acc31{}; for (size_t k_blk_idx = 0; k_blk_idx < BlockCountK; ++k_blk_idx) { const std::byte* QuantABlkRow0 = QuantAPtr; const std::byte* QuantABlkRow1 = QuantAPtr + StrideQuantA; + const std::byte* QuantABlkRow2 = QuantAPtr + StrideQuantA * 2; + const std::byte* QuantABlkRow3 = QuantAPtr + StrideQuantA * 3; const float QuantBScaleCol0 = *QuantBScalePtr; const float QuantBScaleCol1 = *(QuantBScalePtr + StrideQuantBScale); @@ -326,6 +371,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( const float scale01 = Q8BlkScale(QuantABlkRow0) * QuantBScaleCol1; const float scale10 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol0; const float scale11 = Q8BlkScale(QuantABlkRow1) * QuantBScaleCol1; + const float scale20 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol0; + const float scale21 = Q8BlkScale(QuantABlkRow2) * QuantBScaleCol1; + const float scale30 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol0; + const float scale31 = Q8BlkScale(QuantABlkRow3) * QuantBScaleCol1; // load B zero point int8_t bzp_col0; @@ -347,14 +396,10 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( const int8_t* QuantADataPtrRow0 = Q8BlkData(QuantABlkRow0); const int8_t* QuantADataPtrRow1 = Q8BlkData(QuantABlkRow1); + const int8_t* QuantADataPtrRow2 = Q8BlkData(QuantABlkRow2); + const int8_t* QuantADataPtrRow3 = Q8BlkData(QuantABlkRow3); for (size_t sub_blk_idx = 0; sub_blk_idx < SubBlksPerBlk; ++sub_blk_idx) { - // load A - const int8x16_t av_row0_0 = vld1q_s8(QuantADataPtrRow0 + 0); - const int8x16_t av_row0_1 = vld1q_s8(QuantADataPtrRow0 + 16); - const int8x16_t 
av_row1_0 = vld1q_s8(QuantADataPtrRow1 + 0); - const int8x16_t av_row1_1 = vld1q_s8(QuantADataPtrRow1 + 16); - // load B const uint8x16_t bv_packed_col0 = vld1q_u8(reinterpret_cast(QuantBDataPtr)); const uint8x16_t bv_packed_col1 = vld1q_u8(reinterpret_cast(QuantBDataPtr) + StrideQuantBData); @@ -372,28 +417,65 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( bv_col1_0 = vsubq_s8(bv_col1_0, vdupq_n_s8(bzp_col1)); bv_col1_1 = vsubq_s8(bv_col1_1, vdupq_n_s8(bzp_col1)); - // quantized dot product - int32x4_t dot00{}, dot01{}, dot10{}, dot11{}; - dot00 = vdotq_s32(vdotq_s32(dot00, av_row0_0, bv_col0_0), av_row0_1, bv_col0_1); - dot01 = vdotq_s32(vdotq_s32(dot01, av_row0_0, bv_col1_0), av_row0_1, bv_col1_1); - dot10 = vdotq_s32(vdotq_s32(dot10, av_row1_0, bv_col0_0), av_row1_1, bv_col0_1); - dot11 = vdotq_s32(vdotq_s32(dot11, av_row1_0, bv_col1_0), av_row1_1, bv_col1_1); - - // convert to float - const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00); - const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01); - const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10); - const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11); + // rows 0 and 1 of A + { + // load A + const int8x16_t av_row0_0 = vld1q_s8(QuantADataPtrRow0 + 0); + const int8x16_t av_row0_1 = vld1q_s8(QuantADataPtrRow0 + 16); + const int8x16_t av_row1_0 = vld1q_s8(QuantADataPtrRow1 + 0); + const int8x16_t av_row1_1 = vld1q_s8(QuantADataPtrRow1 + 16); + + // quantized dot product + const int32x4_t dot00 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row0_0, bv_col0_0), av_row0_1, bv_col0_1); + const int32x4_t dot01 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row0_0, bv_col1_0), av_row0_1, bv_col1_1); + const int32x4_t dot10 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row1_0, bv_col0_0), av_row1_1, bv_col0_1); + const int32x4_t dot11 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row1_0, bv_col1_0), av_row1_1, bv_col1_1); + + // convert to float + const float32x4_t dot_f32_00 = vcvtq_f32_s32(dot00); + const float32x4_t dot_f32_01 = vcvtq_f32_s32(dot01); + const float32x4_t dot_f32_10 = vcvtq_f32_s32(dot10); + const float32x4_t dot_f32_11 = vcvtq_f32_s32(dot11); + + // multiply by scale and update accumulator + acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00)); + acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01)); + acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10)); + acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11)); + } - // multiply by scale and update accumulator - acc00 = vfmaq_f32(acc00, dot_f32_00, vdupq_n_f32(scale00)); - acc01 = vfmaq_f32(acc01, dot_f32_01, vdupq_n_f32(scale01)); - acc10 = vfmaq_f32(acc10, dot_f32_10, vdupq_n_f32(scale10)); - acc11 = vfmaq_f32(acc11, dot_f32_11, vdupq_n_f32(scale11)); + // rows 2 and 3 of A + { + // load A + const int8x16_t av_row2_0 = vld1q_s8(QuantADataPtrRow2 + 0); + const int8x16_t av_row2_1 = vld1q_s8(QuantADataPtrRow2 + 16); + const int8x16_t av_row3_0 = vld1q_s8(QuantADataPtrRow3 + 0); + const int8x16_t av_row3_1 = vld1q_s8(QuantADataPtrRow3 + 16); + + // quantized dot product + const int32x4_t dot20 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row2_0, bv_col0_0), av_row2_1, bv_col0_1); + const int32x4_t dot21 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row2_0, bv_col1_0), av_row2_1, bv_col1_1); + const int32x4_t dot30 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row3_0, bv_col0_0), av_row3_1, bv_col0_1); + const int32x4_t dot31 = vdotq_s32(vdotq_s32(int32x4_t{}, av_row3_0, bv_col1_0), av_row3_1, bv_col1_1); + + // convert to float + const float32x4_t dot_f32_20 = vcvtq_f32_s32(dot20); + const float32x4_t 
dot_f32_21 = vcvtq_f32_s32(dot21); + const float32x4_t dot_f32_30 = vcvtq_f32_s32(dot30); + const float32x4_t dot_f32_31 = vcvtq_f32_s32(dot31); + + // multiply by scale and update accumulator + acc20 = vfmaq_f32(acc20, dot_f32_20, vdupq_n_f32(scale20)); + acc21 = vfmaq_f32(acc21, dot_f32_21, vdupq_n_f32(scale21)); + acc30 = vfmaq_f32(acc30, dot_f32_30, vdupq_n_f32(scale30)); + acc31 = vfmaq_f32(acc31, dot_f32_31, vdupq_n_f32(scale31)); + } // increment block data pointers to next sub-block QuantADataPtrRow0 += 32; QuantADataPtrRow1 += 32; + QuantADataPtrRow2 += 32; + QuantADataPtrRow3 += 32; QuantBDataPtr += 16; } @@ -407,16 +489,24 @@ SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( } } - SumPtr[0] = vaddvq_f32(acc00); - SumPtr[1] = vaddvq_f32(acc01); - SumPtr[ldc + 0] = vaddvq_f32(acc10); - SumPtr[ldc + 1] = vaddvq_f32(acc11); + SumPtr[ldc * 0 + 0] = vaddvq_f32(acc00); + SumPtr[ldc * 0 + 1] = vaddvq_f32(acc01); + SumPtr[ldc * 1 + 0] = vaddvq_f32(acc10); + SumPtr[ldc * 1 + 1] = vaddvq_f32(acc11); + SumPtr[ldc * 2 + 0] = vaddvq_f32(acc20); + SumPtr[ldc * 2 + 1] = vaddvq_f32(acc21); + SumPtr[ldc * 3 + 0] = vaddvq_f32(acc30); + SumPtr[ldc * 3 + 1] = vaddvq_f32(acc31); if (BiasPtr != nullptr) { - SumPtr[0] += BiasPtr[0]; - SumPtr[1] += BiasPtr[1]; - SumPtr[ldc + 0] += BiasPtr[0]; - SumPtr[ldc + 1] += BiasPtr[1]; + SumPtr[ldc * 0 + 0] += BiasPtr[0]; + SumPtr[ldc * 0 + 1] += BiasPtr[1]; + SumPtr[ldc * 1 + 0] += BiasPtr[0]; + SumPtr[ldc * 1 + 1] += BiasPtr[1]; + SumPtr[ldc * 2 + 0] += BiasPtr[0]; + SumPtr[ldc * 2 + 1] += BiasPtr[1]; + SumPtr[ldc * 3 + 0] += BiasPtr[0]; + SumPtr[ldc * 3 + 1] += BiasPtr[1]; } } @@ -478,8 +568,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( bv1 = vsubq_s8(bv1, bzp1); // quantized dot product - const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0); - const int32x4_t dot1 = vdotq_s32(vdupq_n_s32(0), av1, bv1); + const int32x4_t dot0 = vdotq_s32(int32x4_t{}, av0, bv0); + const int32x4_t dot1 = vdotq_s32(int32x4_t{}, av1, bv1); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -527,7 +617,7 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( bv0 = vsubq_s8(bv0, bzp0); // quantized dot product - const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0); + const int32x4_t dot0 = vdotq_s32(int32x4_t{}, av0, bv0); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -604,9 +694,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( bv_hi1 = vsubq_s8(bv_hi1, bzp1); // quantized dot product - int32x4_t dot0{}, dot1{}; - dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0); - dot1 = vdotq_s32(vdotq_s32(dot1, av_lo1, bv_lo1), av_hi1, bv_hi1); + const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo0, bv_lo0), av_hi0, bv_hi0); + const int32x4_t dot1 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo1, bv_lo1), av_hi1, bv_hi1); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -652,8 +741,7 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( bv_hi0 = vsubq_s8(bv_hi0, bzp0); // quantized dot product - int32x4_t dot0{}; - dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0); + const int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av_lo0, bv_lo0), av_hi0, bv_hi0); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -736,9 +824,8 @@ SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32( bv3 = vsubq_s8(bv3, bzp); // quantized dot product - int32x4_t dot0{}, dot1{}; - dot0 = vdotq_s32(vdotq_s32(dot0, av0, bv0), av1, bv1); - dot1 = vdotq_s32(vdotq_s32(dot1, av2, bv2), av3, bv3); + const 
int32x4_t dot0 = vdotq_s32(vdotq_s32(int32x4_t{}, av0, bv0), av1, bv1); + const int32x4_t dot1 = vdotq_s32(vdotq_s32(int32x4_t{}, av2, bv2), av3, bv3); // convert to float const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); @@ -834,7 +921,7 @@ SQ4BitGemmKernel_CompInt8_BlkLen16( float* SumRowPtr = C; size_t m_remaining = CountM; - while (m_remaining > 1) { + while (m_remaining > 3) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -845,8 +932,8 @@ SQ4BitGemmKernel_CompInt8_BlkLen16( size_t n_remaining = CountN; while (n_remaining > 1) { - // Compute 2x2 tiles of output - SQ4BitGemm_CompInt8_Compute2x2_BlkLen16( + // Compute 4x2 tiles of output + SQ4BitGemm_CompInt8_Compute4x2_BlkLen16( QuantARowPtr, QuantBDataColPtr, QuantBScaleColPtr, @@ -871,38 +958,30 @@ SQ4BitGemmKernel_CompInt8_BlkLen16( } if (n_remaining > 0) { - // Compute last 2x1 tile of output - SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( - QuantARowPtr, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr, - BlockCountK - ); - - SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( - QuantARowPtr + StrideQuantA, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr + ldc, - BlockCountK - ); + // Compute last 4x1 tile of output + for (size_t i = 0; i < 4; ++i) { + SQ4BitGemm_CompInt8_Compute1x1_BlkLen16( + QuantARowPtr + StrideQuantA * i, + QuantBDataColPtr, + QuantBScaleColPtr, + QuantBZeroPointColPtr, + BiasPtr, + SumPtr + ldc * i, + BlockCountK + ); + } } - // Move to next 2 rows - AdvanceRowPtrs<2>( + // Move to next 4 rows + AdvanceRowPtrs<4>( StrideQuantA, ldc, QuantARowPtr, SumRowPtr ); - m_remaining -= 2; + m_remaining -= 4; } - if (m_remaining > 0) { + while (m_remaining > 0) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -932,6 +1011,14 @@ SQ4BitGemmKernel_CompInt8_BlkLen16( n_remaining -= 1; } + + // Move to next row + AdvanceRowPtrs<1>( + StrideQuantA, ldc, + QuantARowPtr, SumRowPtr + ); + + m_remaining -= 1; } } @@ -964,7 +1051,7 @@ SQ4BitGemmKernel_CompInt8_BlkLen32( float* SumRowPtr = C; size_t m_remaining = CountM; - while (m_remaining > 1) { + while (m_remaining > 3) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -975,8 +1062,8 @@ SQ4BitGemmKernel_CompInt8_BlkLen32( size_t n_remaining = CountN; while (n_remaining > 1) { - // Compute 2x2 tiles of output - SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( + // Compute 4x2 tiles of output + SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16( BlkLen, QuantARowPtr, QuantBDataColPtr, @@ -1002,38 +1089,30 @@ SQ4BitGemmKernel_CompInt8_BlkLen32( } if (n_remaining > 0) { - // Compute last 2x1 tile of output - SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( - QuantARowPtr, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr, - BlockCountK - ); - - SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( - QuantARowPtr + StrideQuantA, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr + ldc, - BlockCountK - ); + // Compute last 4x1 tile of output + for (size_t i = 0; i < 4; ++i) { + SQ4BitGemm_CompInt8_Compute1x1_BlkLen32( + QuantARowPtr + StrideQuantA * i, + QuantBDataColPtr, + QuantBScaleColPtr, + QuantBZeroPointColPtr, + BiasPtr, + SumPtr + ldc * i, + 
BlockCountK + ); + } } - // Move to next 2 rows - AdvanceRowPtrs<2>( + // Move to next 4 rows + AdvanceRowPtrs<4>( StrideQuantA, ldc, QuantARowPtr, SumRowPtr ); - m_remaining -= 2; + m_remaining -= 4; } - if (m_remaining > 0) { + while (m_remaining > 0) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -1063,6 +1142,14 @@ SQ4BitGemmKernel_CompInt8_BlkLen32( n_remaining -= 1; } + + // Move to next row + AdvanceRowPtrs<1>( + StrideQuantA, ldc, + QuantARowPtr, SumRowPtr + ); + + m_remaining -= 1; } } @@ -1095,7 +1182,7 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32( float* SumRowPtr = C; size_t m_remaining = CountM; - while (m_remaining > 1) { + while (m_remaining > 3) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -1106,8 +1193,8 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32( size_t n_remaining = CountN; while (n_remaining > 1) { - // Compute 2x2 tiles of output - SQ4BitGemm_CompInt8_Compute2x2_BlkLenGreaterThan16( + // Compute 4x2 tiles of output + SQ4BitGemm_CompInt8_Compute4x2_BlkLenGreaterThan16( BlkLen, QuantARowPtr, QuantBDataColPtr, @@ -1133,40 +1220,31 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32( } if (n_remaining > 0) { - // Compute last 2x1 tile of output - SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32( - BlkLen, - QuantARowPtr, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr, - BlockCountK - ); - - SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32( - BlkLen, - QuantARowPtr + StrideQuantA, - QuantBDataColPtr, - QuantBScaleColPtr, - QuantBZeroPointColPtr, - BiasPtr, - SumPtr + ldc, - BlockCountK - ); + // Compute last 4x1 tile of output + for (size_t i = 0; i < 4; ++i) { + SQ4BitGemm_CompInt8_Compute1x1_BlkLenGreaterThan32( + BlkLen, + QuantARowPtr + StrideQuantA * i, + QuantBDataColPtr, + QuantBScaleColPtr, + QuantBZeroPointColPtr, + BiasPtr, + SumPtr + ldc * i, + BlockCountK + ); + } } - // Move to next 2 rows - AdvanceRowPtrs<2>( + // Move to next 4 rows + AdvanceRowPtrs<4>( StrideQuantA, ldc, QuantARowPtr, SumRowPtr ); - m_remaining -= 2; + m_remaining -= 4; } - if (m_remaining > 0) { + while (m_remaining > 0) { const std::byte* QuantBDataColPtr = QuantBData; const float* QuantBScaleColPtr = QuantBScale; const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; @@ -1197,6 +1275,14 @@ SQ4BitGemmKernel_CompInt8_BlkLenGreaterThan32( n_remaining -= 1; } + + // Move to next row + AdvanceRowPtrs<1>( + StrideQuantA, ldc, + QuantARowPtr, SumRowPtr + ); + + m_remaining -= 1; } } diff --git a/onnxruntime/core/optimizer/attention_fusion.cc b/onnxruntime/core/optimizer/attention_fusion.cc index b88f2d6a4637e..08066f030a381 100644 --- a/onnxruntime/core/optimizer/attention_fusion.cc +++ b/onnxruntime/core/optimizer/attention_fusion.cc @@ -126,7 +126,7 @@ static NodeArg& MergeQkvWeights(Graph& graph, int64_t hidden_size, } else { MergeWeights(q_weight, k_weight, v_weight, result, hidden_size); } - initializer.set_raw_data(result.data(), gsl::narrow(element_count) * sizeof(float)); + utils::SetRawDataInTensorProto(initializer, result.data(), gsl::narrow(element_count) * sizeof(float)); } else { // data_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 const MLFloat16* q_weight = q_initializer.data(); const MLFloat16* k_weight = k_initializer.data(); @@ -138,7 +138,7 @@ static NodeArg& MergeQkvWeights(Graph& graph, int64_t 
hidden_size, } else { MergeWeights(q_weight, k_weight, v_weight, result, hidden_size); } - initializer.set_raw_data(result.data(), gsl::narrow(element_count) * sizeof(MLFloat16)); + utils::SetRawDataInTensorProto(initializer, result.data(), gsl::narrow(element_count) * sizeof(MLFloat16)); } return graph_utils::AddInitializer(graph, initializer); diff --git a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc index 913f3b6811183..86a7a4d6afbf8 100644 --- a/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc +++ b/onnxruntime/core/optimizer/compute_optimizer/shared_utils.cc @@ -188,7 +188,7 @@ NodeArg* CreateInitializerFromVector(Graph& graph, "The total count of dims does not match the size of values. ", "total_count: ", total_count, " values.size(): ", values.size()); - const_tensor.set_raw_data(values.data(), values.size() * sizeof(int64_t)); + utils::SetRawDataInTensorProto(const_tensor, values.data(), values.size() * sizeof(int64_t)); return &graph_utils::AddInitializer(graph, const_tensor); } diff --git a/onnxruntime/core/optimizer/constant_folding.cc b/onnxruntime/core/optimizer/constant_folding.cc index 9df300d6f4f88..1466de51d0b99 100644 --- a/onnxruntime/core/optimizer/constant_folding.cc +++ b/onnxruntime/core/optimizer/constant_folding.cc @@ -82,8 +82,7 @@ static bool ConstantFoldShapeNode(Graph& graph, Node& node) { shape_constant.set_name(constant_arg_out->Name()); shape_constant.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); shape_constant.add_dims(clamped_slice_length); - shape_constant.set_raw_data(dim_values.data() + start, - clamped_slice_length * sizeof(int64_t)); + utils::SetRawDataInTensorProto(shape_constant, dim_values.data() + start, clamped_slice_length * sizeof(int64_t)); ONNX_NAMESPACE::TensorShapeProto result_shape; result_shape.add_dim()->set_dim_value(clamped_slice_length); constant_arg_out->SetShape(result_shape); diff --git a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc index 7b6f829b7a0a4..e8e395678436e 100644 --- a/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/embed_layer_norm_fusion.cc @@ -465,15 +465,13 @@ static NodeArg* ExtractEmbedding(Graph& graph, if (!CheckEmbeddingData(data, batch_size, element_count)) { return nullptr; } - - initializer.set_raw_data(data, gsl::narrow(element_count) * sizeof(float)); + utils::SetRawDataInTensorProto(initializer, data, gsl::narrow(element_count) * sizeof(float)); } else { // data_type == ONNX_NAMESPACE::TensorProto_DataType_FLOAT16 const MLFloat16* data = old_initializer.data(); if (!CheckEmbeddingData(data, batch_size, element_count)) { return nullptr; } - - initializer.set_raw_data(data, gsl::narrow(element_count) * sizeof(MLFloat16)); + utils::SetRawDataInTensorProto(initializer, data, gsl::narrow(element_count) * sizeof(MLFloat16)); } NodeArg& node_arg = graph_utils::AddInitializer(graph, initializer); diff --git a/onnxruntime/core/optimizer/nchwc_transformer.cc b/onnxruntime/core/optimizer/nchwc_transformer.cc index 2b29473f876c3..46f306b92bed5 100644 --- a/onnxruntime/core/optimizer/nchwc_transformer.cc +++ b/onnxruntime/core/optimizer/nchwc_transformer.cc @@ -428,7 +428,8 @@ void NchwcTransformerImpl::TransformConv(Node& node) { nchwc_conv_W_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); nchwc_conv_W_tensor_proto.set_name(graph_.GenerateNodeArgName("reorder")); - 
nchwc_conv_W_tensor_proto.set_raw_data(reordered_filter.data(), reordered_filter.size() * sizeof(float)); + utils::SetRawDataInTensorProto(nchwc_conv_W_tensor_proto, reordered_filter.data(), + reordered_filter.size() * sizeof(float)); nchwc_conv_W_tensor_proto.add_dims(nchwc_output_channels); nchwc_conv_W_tensor_proto.add_dims(filter_input_channels); @@ -458,7 +459,8 @@ void NchwcTransformerImpl::TransformConv(Node& node) { nchwc_conv_B_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); nchwc_conv_B_tensor_proto.set_name(graph_.GenerateNodeArgName("reorder")); - nchwc_conv_B_tensor_proto.set_raw_data(aligned_bias.data(), gsl::narrow(nchwc_output_channels) * sizeof(float)); + utils::SetRawDataInTensorProto(nchwc_conv_B_tensor_proto, aligned_bias.data(), + gsl::narrow(nchwc_output_channels) * sizeof(float)); nchwc_conv_B_tensor_proto.add_dims(nchwc_output_channels); @@ -883,7 +885,8 @@ void NchwcTransformerImpl::TransformBatchNormalization(Node& node) { ONNX_NAMESPACE::TensorProto nchwc_conv_W_tensor_proto; nchwc_conv_W_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); nchwc_conv_W_tensor_proto.set_name(graph_.GenerateNodeArgName("bn_scale")); - nchwc_conv_W_tensor_proto.set_raw_data(padded_buffer.data(), gsl::narrow(nchwc_channels) * sizeof(float)); + utils::SetRawDataInTensorProto(nchwc_conv_W_tensor_proto, padded_buffer.data(), + gsl::narrow(nchwc_channels) * sizeof(float)); nchwc_conv_W_tensor_proto.add_dims(nchwc_channels); nchwc_conv_W_tensor_proto.add_dims(1); nchwc_conv_W_tensor_proto.add_dims(1); @@ -896,7 +899,8 @@ void NchwcTransformerImpl::TransformBatchNormalization(Node& node) { ONNX_NAMESPACE::TensorProto nchwc_conv_B_tensor_proto; nchwc_conv_B_tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); nchwc_conv_B_tensor_proto.set_name(graph_.GenerateNodeArgName("bn_B")); - nchwc_conv_B_tensor_proto.set_raw_data(padded_buffer.data(), gsl::narrow(nchwc_channels) * sizeof(float)); + utils::SetRawDataInTensorProto(nchwc_conv_B_tensor_proto, padded_buffer.data(), + gsl::narrow(nchwc_channels) * sizeof(float)); nchwc_conv_B_tensor_proto.add_dims(nchwc_channels); auto* nchwc_conv_B_arg = &graph_utils::AddInitializer(graph_, nchwc_conv_B_tensor_proto); diff --git a/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc index 6f0f38b1de56e..18e462c04dff3 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/avx2_weight_s8_to_u8.cc @@ -129,7 +129,7 @@ static bool TryConvertDynamicQuantizeLSTM(Node& op_node, Graph& graph) { weights_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); weights_proto_u8.set_name(weight_tensor_proto->name() + "_s8_2_u8"); weights_proto_u8.mutable_dims()->CopyFrom(weight_tensor_proto->dims()); - weights_proto_u8.set_raw_data(w_temp.data(), static_cast(w_temp.size())); + utils::SetRawDataInTensorProto(weights_proto_u8, w_temp.data(), static_cast(w_temp.size())); input_defs[w_idx] = &graph_utils::AddInitializer(graph, weights_proto_u8); ONNX_NAMESPACE::TensorProto weight_zp_proto_u8; @@ -140,7 +140,7 @@ static bool TryConvertDynamicQuantizeLSTM(Node& op_node, Graph& graph) { r_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); r_proto_u8.set_name(r_tensor_proto->name() + "_s8_2_u8"); r_proto_u8.mutable_dims()->CopyFrom(r_tensor_proto->dims()); - r_proto_u8.set_raw_data(r_temp.data(), static_cast(r_temp.size())); + 
utils::SetRawDataInTensorProto(r_proto_u8, r_temp.data(), static_cast(r_temp.size())); input_defs[r_idx] = &graph_utils::AddInitializer(graph, r_proto_u8); ONNX_NAMESPACE::TensorProto r_zp_proto_u8; diff --git a/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc b/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc index 199fbffc9f723..f2033dcbc1b03 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/qdq_s8_to_u8.cc @@ -60,7 +60,7 @@ static bool QDQ_S8_to_U8(Graph& graph, Node& q_node, Node& dq_node) { ONNX_NAMESPACE::TensorProto zp_tensor_proto_u8; zp_tensor_proto_u8.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); zp_tensor_proto_u8.set_name(graph.GenerateNodeArgName("qdq_s8_to_u8_zp_conversion")); - zp_tensor_proto_u8.set_raw_data(&q_zp_value, sizeof(uint8_t)); + utils::SetRawDataInTensorProto(zp_tensor_proto_u8, &q_zp_value, sizeof(uint8_t)); NodeArg* zp_u8_arg = &graph_utils::AddInitializer(graph, zp_tensor_proto_u8); auto q_output_node_arg_name = graph.GenerateNodeArgName("qdq_s8_to_u8_quant"); diff --git a/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h b/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h index 6caa35ea61ed7..1c1341fe5a127 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h +++ b/onnxruntime/core/optimizer/qdq_transformer/s8_to_u8.h @@ -27,7 +27,7 @@ inline bool Int8TensorProto2Uint8( if (nullptr == src) { uint8_t zero_val = 128; dst.set_name(graph.GenerateNodeArgName("weight_zp_s8_2_u8")); - dst.set_raw_data(&zero_val, sizeof(uint8_t)); + utils::SetRawDataInTensorProto(dst, &zero_val, sizeof(uint8_t)); return true; } @@ -58,7 +58,7 @@ inline bool Int8TensorProto2Uint8( p++; } if (force || should_convert) { - dst.set_raw_data(temp.data(), size_t(temp.size())); + utils::SetRawDataInTensorProto(dst, temp.data(), size_t(temp.size())); return true; } return false; diff --git a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc index 3d2a81ce7f8cd..3497ea4c85523 100644 --- a/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc +++ b/onnxruntime/core/optimizer/qdq_transformer/selectors_actions/qdq_actions.cc @@ -5,6 +5,7 @@ #include "core/optimizer/qdq_transformer/qdq_util.h" #include "core/graph/node_attr_utils.h" +#include "core/framework/tensorprotoutils.h" namespace onnxruntime { namespace QDQ { @@ -132,7 +133,7 @@ struct SetOptionalZeroPoint { ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_name(name); tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT8); - tensor_proto.set_raw_data(a.data(), sizeof(int8_t)); + onnxruntime::utils::SetRawDataInTensorProto(tensor_proto, a.data(), sizeof(int8_t)); return tensor_proto; }; @@ -145,8 +146,7 @@ struct SetOptionalZeroPoint { ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_name(name); tensor_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_UINT8); - tensor_proto.set_raw_data(a.data(), sizeof(uint8_t)); - + onnxruntime::utils::SetRawDataInTensorProto(tensor_proto, a.data(), sizeof(uint8_t)); return tensor_proto; }; static ONNX_NAMESPACE::TensorProto GetOptionalZeroPointInt8() { diff --git a/onnxruntime/core/optimizer/reshape_fusion.cc b/onnxruntime/core/optimizer/reshape_fusion.cc index 7768a835d5042..7f94e18458be2 100644 --- a/onnxruntime/core/optimizer/reshape_fusion.cc +++ b/onnxruntime/core/optimizer/reshape_fusion.cc @@ -435,7 +435,7 @@ 
bool ReshapeFusion::Fuse_Subgraph(Node& reshape, Graph& graph, const logging::Lo shape_initializer_proto.set_name(shape_def->Name()); shape_initializer_proto.add_dims(static_cast(shape_value.size())); shape_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); - shape_initializer_proto.set_raw_data(shape_value.data(), shape_value.size() * sizeof(int64_t)); + utils::SetRawDataInTensorProto(shape_initializer_proto, shape_value.data(), shape_value.size() * sizeof(int64_t)); auto& new_node_arg = graph_utils::AddInitializer(graph, shape_initializer_proto); // Safely remove concat parent nodes which have only one output diff --git a/onnxruntime/core/optimizer/stft_decomposition.cc b/onnxruntime/core/optimizer/stft_decomposition.cc index a54904ff15e1e..5c09e5225ab9c 100644 --- a/onnxruntime/core/optimizer/stft_decomposition.cc +++ b/onnxruntime/core/optimizer/stft_decomposition.cc @@ -45,7 +45,7 @@ NodeArg* AddInitializer(Graph& graph, const char* name, const int64_t (&shape)[T element_count *= shape[i]; proto.add_dims(shape[i]); } - proto.set_raw_data(begin, element_count * sizeof(TDataType)); + utils::SetRawDataInTensorProto(proto, begin, element_count * sizeof(TDataType)); return &graph_utils::AddInitializer(graph, proto); } diff --git a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc index 1f7e54cb807ea..f756d01413eae 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/ort_optimizer_api_impl.cc @@ -766,10 +766,10 @@ std::string_view ApiGraph::AddInitializer(api::DataType dtype, const std::vector ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_data_type(gsl::narrow_cast(dtype)); tensor_proto.set_name(name); - tensor_proto.set_raw_data(data.data(), data.size()); for (int64_t dim : shape) { tensor_proto.add_dims(dim); } + utils::SetRawDataInTensorProto(tensor_proto, data.data(), data.size()); const auto& node_arg = graph_utils::AddInitializer(graph_, tensor_proto); return node_arg.Name(); diff --git a/onnxruntime/core/platform/path_lib.h b/onnxruntime/core/platform/path_lib.h index a9d89f32e91d3..fca8990f14821 100644 --- a/onnxruntime/core/platform/path_lib.h +++ b/onnxruntime/core/platform/path_lib.h @@ -281,7 +281,7 @@ void LoopDir(const std::string& dir_name, T func) { ORT_TRY { struct dirent* dp; while ((dp = readdir(dir)) != nullptr) { - std::basic_string filename = ConcatPathComponent(dir_name, dp->d_name); + std::basic_string filename = ConcatPathComponent(dir_name, dp->d_name); if (stat(filename.c_str(), &stats) != 0) { continue; } diff --git a/onnxruntime/core/platform/posix/env.cc b/onnxruntime/core/platform/posix/env.cc index ec06320438977..04cf5ff6a3329 100644 --- a/onnxruntime/core/platform/posix/env.cc +++ b/onnxruntime/core/platform/posix/env.cc @@ -26,7 +26,9 @@ limitations under the License. 
#include #include #include +#if !defined(_AIX) #include +#endif #include #include diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.cc b/onnxruntime/core/platform/windows/logging/etw_sink.cc index 2a74e22850658..b0f9eaf4f62d2 100644 --- a/onnxruntime/core/platform/windows/logging/etw_sink.cc +++ b/onnxruntime/core/platform/windows/logging/etw_sink.cc @@ -98,6 +98,10 @@ ULONGLONG EtwRegistrationManager::Keyword() const { return keyword_; } +HRESULT EtwRegistrationManager::Status() const { + return etw_status_; +} + void EtwRegistrationManager::RegisterInternalCallback(const EtwInternalCallback& callback) { std::lock_guard lock(callbacks_mutex_); callbacks_.push_back(&callback); @@ -140,9 +144,15 @@ EtwRegistrationManager::EtwRegistrationManager() { } void EtwRegistrationManager::LazyInitialize() { - static HRESULT etw_status = ::TraceLoggingRegisterEx(etw_provider_handle, ORT_TL_EtwEnableCallback, nullptr); - if (FAILED(etw_status)) { - ORT_THROW("ETW registration failed. Logging will be broken: " + std::to_string(etw_status)); + if (!initialized_) { + std::lock_guard lock(init_mutex_); + if (!initialized_) { // Double-check locking pattern + initialized_ = true; + etw_status_ = ::TraceLoggingRegisterEx(etw_provider_handle, ORT_TL_EtwEnableCallback, nullptr); + if (FAILED(etw_status_)) { + ORT_THROW("ETW registration failed. Logging will be broken: " + std::to_string(etw_status_)); + } + } } } @@ -161,6 +171,12 @@ void EtwSink::SendImpl(const Timestamp& timestamp, const std::string& logger_id, // register on first usage static EtwRegistrationManager& etw_manager = EtwRegistrationManager::Instance(); + // do something (not that meaningful) with etw_manager so it doesn't get optimized out + // as we want an instance around to do the unregister + if (FAILED(etw_manager.Status())) { + return; + } + // TODO: Validate if this filtering makes sense. 
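Aside on the LazyInitialize change above: the function-local static HRESULT is replaced by an explicit double-checked lock on initialized_, and SendImpl now consults the new Status() accessor so a failed TraceLoggingRegisterEx call simply disables the sink instead of being retried on every log call. A generic initialize-once sketch of that shape, with illustrative names rather than the actual ORT members (long stands in for HRESULT to keep it portable):

    #include <atomic>
    #include <mutex>

    // Illustrative one-time initializer mirroring the double-checked locking
    // pattern used above: the fast path skips the mutex once init is done.
    class LazyStatus {
     public:
      long Status() {
        if (!initialized_.load(std::memory_order_acquire)) {
          std::lock_guard<std::mutex> lock(init_mutex_);
          if (!initialized_.load(std::memory_order_relaxed)) {  // double-check under the lock
            status_ = Register();                               // e.g. a registration call
            initialized_.store(true, std::memory_order_release);
          }
        }
        return status_;
      }

     private:
      long Register() { return 0; }  // placeholder for the real registration step
      std::atomic<bool> initialized_{false};
      std::mutex init_mutex_;
      long status_{0};
    };

With an std::atomic flag the lock-free fast path is well defined; the patch itself takes the simpler route of checking a plain bool before and after acquiring init_mutex_.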
if (message.DataType() == DataType::USER) { return; diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.h b/onnxruntime/core/platform/windows/logging/etw_sink.h index ff68aec0b7d64..3af45b813a625 100644 --- a/onnxruntime/core/platform/windows/logging/etw_sink.h +++ b/onnxruntime/core/platform/windows/logging/etw_sink.h @@ -66,6 +66,9 @@ class EtwRegistrationManager { // Get the current keyword uint64_t Keyword() const; + // Get the ETW registration status + HRESULT Status() const; + void RegisterInternalCallback(const EtwInternalCallback& callback); void UnregisterInternalCallback(const EtwInternalCallback& callback); @@ -97,6 +100,7 @@ class EtwRegistrationManager { bool is_enabled_; UCHAR level_; ULONGLONG keyword_; + HRESULT etw_status_; }; } // namespace logging diff --git a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc index dee87ce3632a8..0e21715513707 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/activation_op_builder.cc @@ -26,6 +26,8 @@ class ActivationOpBuilder : public BaseOpBuilder { const logging::Logger& logger) const override; int GetMinSupportedOpSet(const Node& node) const override; + + bool SupportsMLProgram() const override { return true; } }; void ActivationOpBuilder::AddInitializersToSkip(ModelBuilder& model_builder, const Node& node) const { @@ -74,33 +76,61 @@ Status AddPReluWeight(ModelBuilder& model_builder, const Node& node, Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - const auto& op_type(node.OpType()); - if (op_type == "Sigmoid") { - layer->mutable_activation()->mutable_sigmoid(); - } else if (op_type == "Tanh") { - layer->mutable_activation()->mutable_tanh(); - } else if (op_type == "Relu") { - layer->mutable_activation()->mutable_relu(); - } else if (op_type == "PRelu") { - auto* prelu = layer->mutable_activation()->mutable_prelu(); - ORT_RETURN_IF_ERROR(AddPReluWeight(model_builder, node, logger, *prelu)); - } else if (op_type == "LeakyRelu") { - NodeAttrHelper helper(node); - const auto alpha = helper.Get("alpha", 0.01f); - - auto* leaky_relu = layer->mutable_activation()->mutable_leakyrelu(); - leaky_relu->set_alpha(alpha); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "ActivationOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); - } - *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using namespace CoreML::Specification::MILSpec; + // https://apple.github.io/coremltools/source/coremltools.converters.mil.mil.ops.defs.html#module-coremltools.converters.mil.mil.ops.defs.iOS15.activation + std::string_view coreml_op_type; + if (op_type == "Sigmoid") { + coreml_op_type = "sigmoid"; + } else if (op_type == "Tanh") { + coreml_op_type = "tanh"; + } else if (op_type == "Relu") { + coreml_op_type = "relu"; + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "ActivationOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); + } + + std::unique_ptr op = model_builder.CreateOperation(node, coreml_op_type); + AddOperationInput(*op, "x", node.InputDefs()[0]->Name()); + AddOperationOutput(*op, 
*node.OutputDefs()[0]); + + model_builder.AddOperation(std::move(op)); + + } else +#endif // (COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + + if (op_type == "Sigmoid") { + layer->mutable_activation()->mutable_sigmoid(); + } else if (op_type == "Tanh") { + layer->mutable_activation()->mutable_tanh(); + } else if (op_type == "Relu") { + layer->mutable_activation()->mutable_relu(); + } else if (op_type == "PRelu") { + auto* prelu = layer->mutable_activation()->mutable_prelu(); + ORT_RETURN_IF_ERROR(AddPReluWeight(model_builder, node, logger, *prelu)); + } else if (op_type == "LeakyRelu") { + NodeAttrHelper helper(node); + const auto alpha = helper.Get("alpha", 0.01f); + + auto* leaky_relu = layer->mutable_activation()->mutable_leakyrelu(); + leaky_relu->set_alpha(alpha); + } else { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "ActivationOpBuilder::AddToModelBuilderImpl, unknown op: ", op_type); + } + + *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } - model_builder.AddLayer(std::move(layer)); return Status::OK(); } @@ -165,9 +195,20 @@ bool IsPReluOpSupported(const Node& node, const OpBuilderInputParams& input_para bool ActivationOpBuilder::IsOpSupportedImpl(const Node& node, const OpBuilderInputParams& input_params, const logging::Logger& logger) const { const auto& op_type = node.OpType(); - if (op_type == "PRelu") { - return IsPReluOpSupported(node, input_params, logger); + +#if defined(COREML_ENABLE_MLPROGRAM) + if (input_params.create_mlprogram) { + if (op_type == "PRelu" || op_type == "LeakyRelu") { + return false; + } + } else +#endif // (COREML_ENABLE_MLPROGRAM) + { + if (op_type == "PRelu") { + return IsPReluOpSupported(node, input_params, logger); + } } + return true; } diff --git a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc index f6a61d55a3d63..831c4cf4d08ba 100644 --- a/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc +++ b/onnxruntime/core/providers/coreml/builders/impl/transpose_op_builder.cc @@ -3,6 +3,7 @@ #include "core/providers/coreml/builders/helper.h" #include "core/providers/coreml/builders/impl/base_op_builder.h" +#include "core/providers/coreml/builders/impl/builder_utils.h" #include "core/providers/coreml/builders/model_builder.h" #include "core/providers/coreml/builders/op_builder_factory.h" #include "core/providers/coreml/shape_utils.h" @@ -14,13 +15,13 @@ namespace coreml { class TransposeOpBuilder : public BaseOpBuilder { Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const override; + + bool SupportsMLProgram() const override { return true; } }; Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { - std::unique_ptr layer = model_builder.CreateNNLayer(node); - NodeAttrHelper helper(node); std::vector perm = helper.Get("perm", std::vector()); std::vector input_shape; @@ -33,12 +34,27 @@ Status TransposeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, ORT_RETURN_IF_NOT(perm.size() == input_dims, "Perm and input should have same dimension"); } - *layer->mutable_transpose()->mutable_axes() = {perm.cbegin(), perm.cend()}; +#if defined(COREML_ENABLE_MLPROGRAM) + if (model_builder.CreateMLProgram()) { + using 
namespace CoreML::Specification::MILSpec; + + std::unique_ptr op = model_builder.CreateOperation(node, "transpose"); + AddOperationInput(*op, "x", node.InputDefs()[0]->Name()); + AddOperationInput(*op, "perm", model_builder.AddConstant(op->type(), "perm", perm)); + AddOperationOutput(*op, *node.OutputDefs()[0]); + model_builder.AddOperation(std::move(op)); - *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); - *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + } else +#endif // defined(COREML_ENABLE_MLPROGRAM) + { + std::unique_ptr layer = model_builder.CreateNNLayer(node); + *layer->mutable_transpose()->mutable_axes() = {perm.cbegin(), perm.cend()}; - model_builder.AddLayer(std::move(layer)); + *layer->mutable_input()->Add() = node.InputDefs()[0]->Name(); + *layer->mutable_output()->Add() = node.OutputDefs()[0]->Name(); + + model_builder.AddLayer(std::move(layer)); + } return Status::OK(); } diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.cc b/onnxruntime/core/providers/cuda/tensor/grid_sample.cc similarity index 100% rename from onnxruntime/contrib_ops/cuda/grid_sample.cc rename to onnxruntime/core/providers/cuda/tensor/grid_sample.cc diff --git a/onnxruntime/contrib_ops/cuda/grid_sample.h b/onnxruntime/core/providers/cuda/tensor/grid_sample.h similarity index 100% rename from onnxruntime/contrib_ops/cuda/grid_sample.h rename to onnxruntime/core/providers/cuda/tensor/grid_sample.h diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.cu b/onnxruntime/core/providers/cuda/tensor/grid_sample_impl.cu similarity index 100% rename from onnxruntime/contrib_ops/cuda/grid_sample_impl.cu rename to onnxruntime/core/providers/cuda/tensor/grid_sample_impl.cu diff --git a/onnxruntime/contrib_ops/cuda/grid_sample_impl.h b/onnxruntime/core/providers/cuda/tensor/grid_sample_impl.h similarity index 100% rename from onnxruntime/contrib_ops/cuda/grid_sample_impl.h rename to onnxruntime/core/providers/cuda/tensor/grid_sample_impl.h diff --git a/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc b/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc index 72193ef6268c1..94480c308b99f 100644 --- a/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc +++ b/onnxruntime/core/providers/migraphx/gpu_data_transfer.cc @@ -60,17 +60,7 @@ common::Status GPUDataTransfer::CopyTensorAsync(const Tensor& src, Tensor& dst, HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyHostToDevice)); } } else if (src_device.Type() == OrtDevice::GPU) { -#ifndef MIGRAPHX_STREAM_SYNC - if (dst_device.Type() == OrtDevice::CPU && dst_device.MemType() == OrtDevice::MemType::HIP_PINNED) { - // copying from GPU to pinned memory, this is non-blocking - HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast(stream.GetHandle()))); - } else { - // copying from GPU to CPU memory, this is blocking - HIP_CALL_THROW(hipMemcpy(dst_data, src_data, bytes, hipMemcpyDeviceToHost)); - } -#else HIP_CALL_THROW(hipMemcpyAsync(dst_data, src_data, bytes, hipMemcpyDeviceToHost, static_cast(stream.GetHandle()))); -#endif } else { // copying between cpu memory memcpy(dst_data, src_data, bytes); diff --git a/onnxruntime/core/providers/migraphx/hip_allocator.cc b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc similarity index 83% rename from onnxruntime/core/providers/migraphx/hip_allocator.cc rename to onnxruntime/core/providers/migraphx/migraphx_allocator.cc index 53f10e318e65f..0693eea056416 100644 --- a/onnxruntime/core/providers/migraphx/hip_allocator.cc 
+++ b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc @@ -3,7 +3,7 @@ #include "core/providers/shared_library/provider_api.h" #include "migraphx_call.h" -#include "hip_allocator.h" +#include "migraphx_allocator.h" #include "core/common/status.h" #include "core/framework/float16.h" #include "core/common/status.h" @@ -11,7 +11,7 @@ namespace onnxruntime { -void HIPAllocator::CheckDevice() const { +void MIGraphXAllocator::CheckDevice() const { #ifndef NDEBUG // check device to match at debug build // if it's expected to change, call hipSetDevice instead of the check @@ -23,7 +23,7 @@ void HIPAllocator::CheckDevice() const { #endif } -void* HIPAllocator::Alloc(size_t size) { +void* MIGraphXAllocator::Alloc(size_t size) { CheckDevice(); void* p = nullptr; if (size > 0) { @@ -32,12 +32,12 @@ void* HIPAllocator::Alloc(size_t size) { return p; } -void HIPAllocator::Free(void* p) { +void MIGraphXAllocator::Free(void* p) { CheckDevice(); (void)hipFree(p); // do not throw error since it's OK for hipFree to fail during shutdown } -void* HIPExternalAllocator::Alloc(size_t size) { +void* MIGraphXExternalAllocator::Alloc(size_t size) { void* p = nullptr; if (size > 0) { p = alloc_(size); @@ -49,7 +49,7 @@ void* HIPExternalAllocator::Alloc(size_t size) { return p; } -void HIPExternalAllocator::Free(void* p) { +void MIGraphXExternalAllocator::Free(void* p) { free_(p); std::lock_guard lock(lock_); auto it = reserved_.find(p); @@ -59,7 +59,7 @@ void HIPExternalAllocator::Free(void* p) { } } -void* HIPExternalAllocator::Reserve(size_t size) { +void* MIGraphXExternalAllocator::Reserve(size_t size) { void* p = Alloc(size); if (!p) return nullptr; std::lock_guard lock(lock_); diff --git a/onnxruntime/core/providers/migraphx/hip_allocator.h b/onnxruntime/core/providers/migraphx/migraphx_allocator.h similarity index 78% rename from onnxruntime/core/providers/migraphx/hip_allocator.h rename to onnxruntime/core/providers/migraphx/migraphx_allocator.h index 3244f9f04ea70..64da844e8c714 100644 --- a/onnxruntime/core/providers/migraphx/hip_allocator.h +++ b/onnxruntime/core/providers/migraphx/migraphx_allocator.h @@ -9,12 +9,12 @@ namespace onnxruntime { -class HIPAllocator : public IAllocator { +class MIGraphXAllocator : public IAllocator { public: - HIPAllocator(int device_id, const char* name) + MIGraphXAllocator(int device_id, const char* name) : IAllocator( OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, device_id), + OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, static_cast(device_id)), device_id, OrtMemTypeDefault)) {} virtual void* Alloc(size_t size) override; @@ -24,14 +24,14 @@ class HIPAllocator : public IAllocator { void CheckDevice() const; }; -class HIPExternalAllocator : public HIPAllocator { +class MIGraphXExternalAllocator : public MIGraphXAllocator { typedef void* (*ExternalAlloc)(size_t size); typedef void (*ExternalFree)(void* p); typedef void (*ExternalEmptyCache)(); public: - HIPExternalAllocator(OrtDevice::DeviceId device_id, const char* name, void* alloc, void* free, void* empty_cache) - : HIPAllocator(device_id, name) { + MIGraphXExternalAllocator(OrtDevice::DeviceId device_id, const char* name, void* alloc, void* free, void* empty_cache) + : MIGraphXAllocator(device_id, name) { alloc_ = reinterpret_cast(alloc); free_ = reinterpret_cast(free); empty_cache_ = reinterpret_cast(empty_cache); @@ -55,7 +55,7 @@ class HIPPinnedAllocator : public IAllocator { HIPPinnedAllocator(int device_id, const char* name) : 
IAllocator( OrtMemoryInfo(name, OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, device_id), + OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, static_cast(device_id)), device_id, OrtMemTypeCPUOutput)) {} virtual void* Alloc(size_t size) override; diff --git a/onnxruntime/core/providers/migraphx/migraphx_call.cc b/onnxruntime/core/providers/migraphx/migraphx_call.cc index 5248ac2f39214..9807cd646e51c 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_call.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_call.cc @@ -1,10 +1,13 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#ifdef _WIN32 +#include +#else #include -#include -#include -#include +#endif + +#include #include "core/common/common.h" #include "core/common/status.h" #include "core/providers/shared_library/provider_api.h" @@ -34,16 +37,20 @@ std::conditional_t RocmCall( ERRTYPE retCode, const char* exprString, const char* libName, ERRTYPE successCode, const char* msg, const char* file, const int line) { if (retCode != successCode) { try { - char hostname[HOST_NAME_MAX]; - if (gethostname(hostname, HOST_NAME_MAX) != 0) - strcpy(hostname, "?"); +#ifdef _WIN32 + // According to the POSIX spec, 255 is the safe minimum value. + static constexpr int HOST_NAME_MAX = 255; +#endif + std::string hostname(HOST_NAME_MAX, 0); + if (gethostname(hostname.data(), HOST_NAME_MAX) != 0) + hostname = "?"; int currentHipDevice; (void)hipGetDevice(¤tHipDevice); (void)hipGetLastError(); // clear last HIP error static char str[1024]; snprintf(str, 1024, "%s failure %d: %s ; GPU=%d ; hostname=%s ; file=%s ; line=%d ; expr=%s; %s", libName, (int)retCode, RocmErrString(retCode), currentHipDevice, - hostname, + hostname.c_str(), file, line, exprString, msg); if constexpr (THRW) { // throw an exception with the error info @@ -68,9 +75,5 @@ std::conditional_t RocmCall( template Status RocmCall(hipError_t retCode, const char* exprString, const char* libName, hipError_t successCode, const char* msg, const char* file, const int line); template void RocmCall(hipError_t retCode, const char* exprString, const char* libName, hipError_t successCode, const char* msg, const char* file, const int line); -template Status RocmCall(rocblas_status retCode, const char* exprString, const char* libName, rocblas_status successCode, const char* msg, const char* file, const int line); -template void RocmCall(rocblas_status retCode, const char* exprString, const char* libName, rocblas_status successCode, const char* msg, const char* file, const int line); -template Status RocmCall(miopenStatus_t retCode, const char* exprString, const char* libName, miopenStatus_t successCode, const char* msg, const char* file, const int line); -template void RocmCall(miopenStatus_t retCode, const char* exprString, const char* libName, miopenStatus_t successCode, const char* msg, const char* file, const int line); } // namespace onnxruntime diff --git a/onnxruntime/core/providers/migraphx/migraphx_call.h b/onnxruntime/core/providers/migraphx/migraphx_call.h index 15d385a636b76..f6a95cebf34b5 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_call.h +++ b/onnxruntime/core/providers/migraphx/migraphx_call.h @@ -4,8 +4,6 @@ #pragma once #include "migraphx_inc.h" -#pragma once - namespace onnxruntime { // ----------------------------------------------------------------------- diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc 
b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc index 6ee85c3a4c047..097b16ecde536 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc @@ -13,12 +13,11 @@ #include "core/common/logging/severity.h" #include "migraphx_execution_provider.h" #include "migraphx_execution_provider_utils.h" -#include "hip_allocator.h" +#include "migraphx_allocator.h" #include "gpu_data_transfer.h" #include "migraphx_inc.h" -// TODO: find a better way to share this -#include "core/providers/rocm/rocm_stream_handle.h" +#include "migraphx_stream_handle.h" #if defined(_MSC_VER) #pragma warning(disable : 4244 4245) @@ -102,10 +101,10 @@ std::shared_ptr MIGraphXExecutionProvider::GetKernelRegistry() c } MIGraphXExecutionProvider::MIGraphXExecutionProvider(const MIGraphXExecutionProviderInfo& info) - : IExecutionProvider{onnxruntime::kMIGraphXExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, info.device_id)}, device_id_(info.device_id) { + : IExecutionProvider{onnxruntime::kMIGraphXExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, info.device_id)}, info_(info) { InitProviderOrtApi(); // Set GPU device to be used - HIP_CALL_THROW(hipSetDevice(device_id_)); + HIP_CALL_THROW(hipSetDevice(info_.device_id)); t_ = migraphx::target(info.target_device.c_str()); // whether fp16 is enable @@ -181,16 +180,10 @@ MIGraphXExecutionProvider::MIGraphXExecutionProvider(const MIGraphXExecutionProv dump_model_ops_ = (std::stoi(dump_model_ops_env) == 0 ? false : true); } - ROCBLAS_CALL_THROW(rocblas_create_handle(&external_rocblas_handle_)); - ROCBLAS_CALL_THROW(rocblas_set_stream(external_rocblas_handle_, stream_)); - - MIOPEN_CALL_THROW(miopenCreate(&external_miopen_handle_)); - MIOPEN_CALL_THROW(miopenSetStream(external_miopen_handle_, stream_)); - metadef_id_generator_ = ModelMetadefIdGenerator::Create(); LOGS_DEFAULT(VERBOSE) << "[MIGraphX EP] MIGraphX provider options: " - << "device_id: " << device_id_ + << "device_id: " << info_.device_id << ", migraphx_fp16_enable: " << fp16_enable_ << ", migraphx_int8_enable: " << int8_enable_ << ", migraphx_int8_enable: " << int8_enable_ @@ -205,17 +198,14 @@ MIGraphXExecutionProvider::MIGraphXExecutionProvider(const MIGraphXExecutionProv } MIGraphXExecutionProvider::~MIGraphXExecutionProvider() { - ORT_IGNORE_RETURN_VALUE(ROCBLAS_CALL(rocblas_destroy_handle(external_rocblas_handle_))); - ORT_IGNORE_RETURN_VALUE(MIOPEN_CALL(miopenDestroy(external_miopen_handle_))); } std::vector MIGraphXExecutionProvider::CreatePreferredAllocators() { AllocatorCreationInfo default_memory_info( - [](OrtDevice::DeviceId device_id) { return CreateROCMAllocator(device_id, onnxruntime::CUDA); }, device_id_); + [](OrtDevice::DeviceId device_id) { return CreateMIGraphXAllocator(device_id, onnxruntime::CUDA); }, info_.device_id); AllocatorCreationInfo pinned_allocator_info( [](OrtDevice::DeviceId device_id) { - ORT_UNUSED_PARAMETER(device_id); - return CreateROCMPinnedAllocator(onnxruntime::CUDA_PINNED); + return CreateMIGraphXPinnedAllocator(device_id, onnxruntime::CUDA_PINNED); }, 0); return std::vector{CreateAllocator(default_memory_info), CreateAllocator(pinned_allocator_info)}; @@ -254,40 +244,40 @@ static bool getMIGraphXType(ONNXTensorElementDataType type, migraphx_shape_datatype_t& mgx_type) { mgx_type = migraphx_shape_float_type; switch (type) { - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16: + case 
ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: mgx_type = migraphx_shape_half_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: mgx_type = migraphx_shape_float_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_DOUBLE: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: mgx_type = migraphx_shape_double_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: mgx_type = migraphx_shape_int8_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16: mgx_type = migraphx_shape_int16_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: mgx_type = migraphx_shape_int32_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT64: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: mgx_type = migraphx_shape_int64_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT8: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: mgx_type = migraphx_shape_uint8_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: mgx_type = migraphx_shape_uint16_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT32: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: mgx_type = migraphx_shape_uint32_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT64: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64: mgx_type = migraphx_shape_uint64_type; break; - case ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BOOL: + case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: mgx_type = migraphx_shape_bool_type; break; default: @@ -303,7 +293,7 @@ std::vector toVector(const ONNX_NAMESPACE::int64s& nums) { std::vector result; int num = nums.size(); for (int i = 0; i < num; ++i) { - result.push_back(nums[i]); + result.push_back(static_cast(nums[i])); } return result; @@ -501,16 +491,9 @@ static bool IsUnsupportedOpMode(const onnxruntime::GraphViewer& graph_viewer, co if (arg_s != nullptr) { const auto& tensor_dims = arg_s->dim(); std::vector dims; - std::transform(tensor_dims.begin(), - tensor_dims.end(), - std::back_inserter(dims), - [&](auto&& d) -> std::size_t { - if (d.has_dim_value()) { - return d.dim_value(); - } else { - return 0; - } - }); + for (auto&& dim : tensor_dims) { + dims.emplace_back(dim.has_dim_value() ? dim.dim_value() : 0); + } if (dims == std::vector{0}) { return true; } @@ -546,8 +529,8 @@ static bool IsUnsupportedOpMode(const onnxruntime::GraphViewer& graph_viewer, co } void SubgraphPostProcessing(const onnxruntime::GraphViewer& graph_viewer, std::vector>& clusters, - const logging::Logger& logger) { - // Then check whether a subgraph should fallback to CPU + [[maybe_unused]] const logging::Logger& logger) { + // Then check whether a subgraph should fall back to CPU // 1. 
Check whether a subgraph contains a RNN operator std::unordered_set rnn_names = {"RNN", "GRU", "LSTM"}; std::unordered_set op_names = {"AveragePool", "Conv", "Gemm", "LRN", "MatMul", "MaxPool"}; @@ -591,17 +574,10 @@ void SubgraphPostProcessing(const onnxruntime::GraphViewer& graph_viewer, std::v if (arg_s == nullptr) return false; const auto& tensor_dims = arg_s->dim(); std::vector dims; - std::transform(tensor_dims.begin(), - tensor_dims.end(), - std::back_inserter(dims), - [&](auto&& d) -> std::size_t { - if (d.has_dim_value()) { - return d.dim_value(); - } else { - return 1; - } - }); - return (std::accumulate(dims.begin(), dims.end(), 1, std::multiplies{}) > 300); + for (auto&& dim : tensor_dims) { + dims.emplace_back(dim.has_dim_value() ? dim.dim_value() : 1); + } + return (std::accumulate(dims.begin(), dims.end(), 1ULL, std::multiplies{}) > 300); })) { return false; } @@ -623,7 +599,7 @@ void SubgraphPostProcessing(const onnxruntime::GraphViewer& graph_viewer, std::v static bool IsNodeSupported(const std::set& op_set, const onnxruntime::GraphViewer& graph_viewer, const NodeIndex node_idx, - const logging::Logger& logger) { + [[maybe_unused]] const logging::Logger& logger) { const auto& node = graph_viewer.GetNode(node_idx); const auto& optype = node->OpType(); const auto& domain = node->Domain(); @@ -1442,14 +1418,10 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& // lock to avoid race condition std::lock_guard lock(*(mgx_state->mgx_mu_ptr)); -#ifdef MIGRAPHX_STREAM_SYNC void* rocm_stream; Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &rocm_stream)); auto prog_outputs = prog.run_async(m, static_cast(rocm_stream)); -#else - auto prog_outputs = prog.eval(m); - HIP_CALL_THROW(hipDeviceSynchronize()); -#endif + // In case of input parameters are reused as output parameter call hipMemcpy auto output_num = prog_outputs.size(); if (prog_output_indices.size() < output_num) { @@ -1478,8 +1450,7 @@ Status MIGraphXExecutionProvider::Compile(const std::vector& void MIGraphXExecutionProvider::RegisterStreamHandlers(IStreamCommandHandleRegistry& stream_handle_registry, AllocatorMap& allocators) const { auto allocator = allocators[GetOrtDeviceByMemType(OrtMemTypeCPU)]; - RegisterRocmStreamHandles(stream_handle_registry, OrtDevice::GPU, allocator, true, stream_, - false /*TODO:external_stream_*/, external_miopen_handle_, external_rocblas_handle_); + RegisterMIGraphXStreamHandles(stream_handle_registry, OrtDevice::GPU, allocator, true, stream_, false /*TODO:external_stream_*/); } OrtDevice MIGraphXExecutionProvider::GetOrtDeviceByMemType(OrtMemType mem_type) const { @@ -1487,7 +1458,6 @@ OrtDevice MIGraphXExecutionProvider::GetOrtDeviceByMemType(OrtMemType mem_type) if (mem_type == OrtMemTypeCPUOutput) return OrtDevice(OrtDevice::CPU, OrtDevice::MemType::HIP_PINNED, 0 /*CPU device id always be 0*/); return default_device_; } -#ifdef MIGRAPHX_STREAM_SYNC Status MIGraphXExecutionProvider::Sync() const { HIP_CALL_THROW(hipStreamSynchronize(static_cast(nullptr))); @@ -1512,5 +1482,4 @@ Status MIGraphXExecutionProvider::OnRunEnd(bool /*sync_stream*/, const onnxrunti return Status::OK(); } -#endif } // namespace onnxruntime diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h index 1977f71b8b1cf..f34ca320d0a5a 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h @@ -3,9 +3,6 @@ 
#pragma once -#include -#include - #include "core/framework/arena_extend_strategy.h" #include "core/framework/execution_provider.h" #include "core/platform/ort_mutex.h" @@ -14,8 +11,6 @@ #include #include -// TODO: find a better way to share this -// #include "core/providers/cuda/rocm_stream_handle.h" namespace onnxruntime { @@ -62,13 +57,11 @@ class MIGraphXExecutionProvider : public IExecutionProvider { explicit MIGraphXExecutionProvider(const MIGraphXExecutionProviderInfo& info); ~MIGraphXExecutionProvider(); -#ifdef MIGRAPHX_STREAM_SYNC Status Sync() const override; Status OnRunStart(const onnxruntime::RunOptions& run_options) override; Status OnRunEnd(bool sync_stream, const onnxruntime::RunOptions& run_options) override; -#endif std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, @@ -85,7 +78,13 @@ class MIGraphXExecutionProvider : public IExecutionProvider { OrtDevice GetOrtDeviceByMemType(OrtMemType mem_type) const override; std::vector CreatePreferredAllocators() override; + int GetDeviceId() const override { return info_.device_id; } + ProviderOptions GetProviderOptions() const override { + return MIGraphXExecutionProviderInfo::ToProviderOptions(info_); + } + private: + MIGraphXExecutionProviderInfo info_; bool fp16_enable_ = false; bool int8_enable_ = false; std::string int8_calibration_cache_name_; @@ -98,7 +97,6 @@ class MIGraphXExecutionProvider : public IExecutionProvider { bool load_compiled_model_ = false; std::string load_compiled_path_; bool dump_model_ops_ = false; - int device_id_; migraphx::target t_; OrtMutex mgx_mu_; hipStream_t stream_ = nullptr; @@ -109,8 +107,6 @@ class MIGraphXExecutionProvider : public IExecutionProvider { std::unordered_map map_no_input_shape_; AllocatorPtr allocator_; - miopenHandle_t external_miopen_handle_ = nullptr; - rocblas_handle external_rocblas_handle_ = nullptr; std::unique_ptr metadef_id_generator_; }; diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider_info.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider_info.h index 8411e3eef096b..68d5d9af98ea4 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider_info.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider_info.h @@ -14,7 +14,7 @@ namespace onnxruntime { // Information needed to construct trt execution providers. 
struct MIGraphXExecutionProviderInfo { std::string target_device; - int device_id{0}; + OrtDevice::DeviceId device_id{0}; bool fp16_enable{false}; bool int8_enable{false}; std::string int8_calibration_table_name{""}; diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider_utils.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider_utils.h index 071070e92a209..9274b5696185c 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider_utils.h +++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider_utils.h @@ -28,7 +28,7 @@ bool IsGraphInput(const GraphViewer& graph, const std::string& name) { return (std::find(input_names.begin(), input_names.end(), name) != input_names.end()); } -bool IsGraphInitializer(const GraphViewer& graph, const std::string& name, bool check_outer_scope = true) { +bool IsGraphInitializer(const GraphViewer& graph, const std::string& name, [[maybe_unused]] bool check_outer_scope = true) { const ONNX_NAMESPACE::TensorProto* initializer = nullptr; return graph.GetInitializedTensor(name, initializer); } diff --git a/onnxruntime/core/providers/migraphx/migraphx_inc.h b/onnxruntime/core/providers/migraphx/migraphx_inc.h index 96b24051ace76..2b035b20f619f 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_inc.h +++ b/onnxruntime/core/providers/migraphx/migraphx_inc.h @@ -4,5 +4,5 @@ #pragma once #include -#include +#include #include diff --git a/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc b/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc index dd24dbdc76d2f..6d199930116e8 100644 --- a/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc +++ b/onnxruntime/core/providers/migraphx/migraphx_provider_factory.cc @@ -6,7 +6,7 @@ #include "core/providers/migraphx/migraphx_provider_factory.h" #include "migraphx_execution_provider.h" #include "migraphx_provider_factory_creator.h" -#include "hip_allocator.h" +#include "migraphx_allocator.h" #include "gpu_data_transfer.h" #include "core/framework/provider_options.h" @@ -33,10 +33,23 @@ std::unique_ptr MIGraphXProviderFactory::CreateProvider() { return std::make_unique(info_); } +struct ProviderInfo_MIGraphX_Impl final : ProviderInfo_MIGraphX { + std::unique_ptr CreateMIGraphXAllocator(int16_t device_id, const char* name) override { + return std::make_unique(device_id, name); + } + + std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name) override { + return std::make_unique(device_id, name); + } + +} g_info; + struct MIGraphX_Provider : Provider { + void* GetInfo() override { return &g_info; } + std::shared_ptr CreateExecutionProviderFactory(int device_id) override { MIGraphXExecutionProviderInfo info; - info.device_id = device_id; + info.device_id = static_cast(device_id); info.target_device = "gpu"; return std::make_shared(info); } @@ -44,7 +57,7 @@ struct MIGraphX_Provider : Provider { std::shared_ptr CreateExecutionProviderFactory(const void* provider_options) override { auto& options = *reinterpret_cast(provider_options); MIGraphXExecutionProviderInfo info; - info.device_id = options.device_id; + info.device_id = static_cast(options.device_id); info.target_device = "gpu"; info.fp16_enable = options.migraphx_fp16_enable; info.int8_enable = options.migraphx_int8_enable; diff --git a/onnxruntime/core/providers/migraphx/migraphx_provider_factory.h b/onnxruntime/core/providers/migraphx/migraphx_provider_factory.h index ac9834e64942a..b257a4318dc0e 100644 --- 
a/onnxruntime/core/providers/migraphx/migraphx_provider_factory.h +++ b/onnxruntime/core/providers/migraphx/migraphx_provider_factory.h @@ -10,4 +10,13 @@ struct IExecutionProviderFactory; struct MIGraphXExecutionProviderInfo; enum class ArenaExtendStrategy : int32_t; struct MIGraphXExecutionProviderExternalAllocatorInfo; + +struct ProviderInfo_MIGraphX { + virtual std::unique_ptr CreateMIGraphXAllocator(int16_t device_id, const char* name) = 0; + virtual std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name) = 0; + + protected: + ~ProviderInfo_MIGraphX() = default; // Can only be destroyed through a subclass instance +}; + } // namespace onnxruntime diff --git a/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc b/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc new file mode 100644 index 0000000000000..9c5bb4ecf5c97 --- /dev/null +++ b/onnxruntime/core/providers/migraphx/migraphx_stream_handle.cc @@ -0,0 +1,171 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include +#include "migraphx_stream_handle.h" + +namespace onnxruntime { + +struct MIGraphXNotification : public synchronize::Notification { + MIGraphXNotification(Stream& s) : Notification(s) { + HIP_CALL_THROW(hipEventCreateWithFlags(&event_, hipEventDisableTiming)); + } + + ~MIGraphXNotification() { + if (event_) + HIP_CALL_THROW(hipEventDestroy(event_)); + } + + void Activate() override { + // record event with hipEventBlockingSync so we can support sync on host without busy wait. + HIP_CALL_THROW(hipEventRecord(event_, static_cast(stream_.GetHandle()))); + } + + void wait_on_device(Stream& device_stream) { + ORT_ENFORCE(device_stream.GetDevice().Type() == OrtDevice::GPU, "Unexpected device:", device_stream.GetDevice().ToString()); + // launch a wait command to the migraphx stream + HIP_CALL_THROW(hipStreamWaitEvent(static_cast(device_stream.GetHandle()), event_, 0)); + }; + + void wait_on_host() { + // CUDA_CALL_THROW(cudaStreamSynchronize(stream_)); + HIP_CALL_THROW(hipEventSynchronize(event_)); + } + + hipEvent_t event_; +}; + +MIGraphXStream::MIGraphXStream(hipStream_t stream, + const OrtDevice& device, + AllocatorPtr cpu_allocator, + bool release_cpu_buffer_on_migraphx_stream) + : Stream(stream, device), + cpu_allocator_(cpu_allocator), + release_cpu_buffer_on_migraphx_stream_(release_cpu_buffer_on_migraphx_stream) { +} + +MIGraphXStream::~MIGraphXStream() { + ORT_IGNORE_RETURN_VALUE(CleanUpOnRunEnd()); + if (own_stream_) { + auto* handle = GetHandle(); + if (handle) + HIP_CALL_THROW(hipStreamDestroy(static_cast(handle))); + } +} + +std::unique_ptr MIGraphXStream::CreateNotification(size_t /*num_consumers*/) { + return std::make_unique(*this); +} + +void MIGraphXStream::Flush() { + if (own_stream_) + HIP_CALL_THROW(hipStreamSynchronize(static_cast(GetHandle()))); +} + +void MIGraphXStream::EnqueDeferredCPUBuffer(void* cpu_buffer) { + // stream is per thread, so don't need lock + deferred_cpu_buffers_.push_back(cpu_buffer); +} + +struct CpuBuffersInfo { + // This struct stores the information needed + // to release CPU buffers allocated for GPU kernels. + // It's used to enqueue their release after + // associated GPU kernels in a MIGraphX stream. + + // This is a CPU allocator in MIGraphX EP. + // It must be the one used to allocate the + // following pointers. + AllocatorPtr allocator; + // buffers[i] is the i-th pointer added by + // AddDeferredReleaseCPUPtr for a specific + // MIGraphX stream. 
For example, this fields + // should contain all values in + // deferred_release_buffer_pool_[my_stream] + // when release my_stream's buffers. + std::unique_ptr buffers; + // CPU buffer buffers[i]. + // Number of buffer points in "buffers". + size_t n_buffers; +}; + +static void ReleaseCpuBufferCallback(void* raw_info) { + std::unique_ptr info = std::make_unique(); + info.reset(reinterpret_cast(raw_info)); + for (size_t i = 0; i < info->n_buffers; ++i) { + info->allocator->Free(info->buffers[i]); + } +} + +Status MIGraphXStream::CleanUpOnRunEnd() { + if (deferred_cpu_buffers_.empty()) + return Status::OK(); + // Release the ownership of cpu_buffers_info so that the underlying + // object will keep alive until the end of ReleaseCpuBufferCallback. + if (release_cpu_buffer_on_migraphx_stream_ && cpu_allocator_->Info().alloc_type == OrtArenaAllocator) { + std::unique_ptr cpu_buffers_info = std::make_unique(); + cpu_buffers_info->allocator = cpu_allocator_; + cpu_buffers_info->buffers = std::make_unique(deferred_cpu_buffers_.size()); + for (size_t i = 0; i < deferred_cpu_buffers_.size(); ++i) { + cpu_buffers_info->buffers[i] = deferred_cpu_buffers_.at(i); + } + cpu_buffers_info->n_buffers = deferred_cpu_buffers_.size(); + HIP_RETURN_IF_ERROR(hipLaunchHostFunc(static_cast(GetHandle()), ReleaseCpuBufferCallback, cpu_buffers_info.release())); + } else { + HIP_RETURN_IF_ERROR(hipStreamSynchronize(static_cast(GetHandle()))); + for (auto* buffer : deferred_cpu_buffers_) { + cpu_allocator_->Free(buffer); + } + } + + deferred_cpu_buffers_.clear(); + return Status::OK(); +} + +void* MIGraphXStream::GetResource(int version, int id) const { + ORT_ENFORCE(version <= ORT_ROCM_RESOUCE_VERSION, "resource version unsupported!"); + void* resource{}; + switch (id) { + case RocmResource::hip_stream_t: + return reinterpret_cast(GetHandle()); + default: + break; + } + return resource; +} + +// CPU Stream command handles +void WaitMIGraphXNotificationOnDevice(Stream& stream, synchronize::Notification& notification) { + static_cast(¬ification)->wait_on_device(stream); +} + +void WaitMIGraphXNotificationOnHost(Stream& /*stream*/, synchronize::Notification& notification) { + static_cast(¬ification)->wait_on_host(); +} + +void RegisterMIGraphXStreamHandles(IStreamCommandHandleRegistry& stream_handle_registry, + const OrtDevice::DeviceType device_type, + AllocatorPtr cpu_allocator, + bool release_cpu_buffer_on_migraphx_stream, + hipStream_t external_stream, + bool use_existing_stream) { + // wait migraphx notification on migraphx ep + stream_handle_registry.RegisterWaitFn(device_type, device_type, WaitMIGraphXNotificationOnDevice); + // wait migraphx notification on cpu ep + stream_handle_registry.RegisterWaitFn(device_type, OrtDevice::CPU, WaitMIGraphXNotificationOnHost); + if (!use_existing_stream) + stream_handle_registry.RegisterCreateStreamFn(device_type, [cpu_allocator, release_cpu_buffer_on_migraphx_stream](const OrtDevice& device) { + HIP_CALL_THROW(hipSetDevice(device.Id())); + hipStream_t stream = nullptr; + HIP_CALL_THROW(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); + return std::make_unique(stream, device, cpu_allocator, release_cpu_buffer_on_migraphx_stream); + }); + else + stream_handle_registry.RegisterCreateStreamFn(device_type, [cpu_allocator, + release_cpu_buffer_on_migraphx_stream, + external_stream](const OrtDevice& device) { + return std::make_unique(external_stream, device, cpu_allocator, release_cpu_buffer_on_migraphx_stream); + }); +} + +} // namespace onnxruntime diff --git 
a/onnxruntime/core/providers/migraphx/migraphx_stream_handle.h b/onnxruntime/core/providers/migraphx/migraphx_stream_handle.h new file mode 100644 index 0000000000000..03a7c1607e3ad --- /dev/null +++ b/onnxruntime/core/providers/migraphx/migraphx_stream_handle.h @@ -0,0 +1,48 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once +#include "core/framework/stream_handles.h" +#include "migraphx_inc.h" +#include "migraphx_call.h" + +#define HIP_RETURN_IF_ERROR(expr) ORT_RETURN_IF_ERROR(HIP_CALL(expr)) + +namespace onnxruntime { +void WaitMIGraphXNotificationOnDevice(Stream& stream, synchronize::Notification& notification); + +struct MIGraphXStream : Stream { + MIGraphXStream(hipStream_t stream, + const OrtDevice& device, + AllocatorPtr cpu_allocator, + bool release_cpu_buffer_on_migraphx_stream); + + ~MIGraphXStream(); + + std::unique_ptr CreateNotification(size_t /*num_consumers*/) override; + + void Flush() override; + + Status CleanUpOnRunEnd() override; + + void EnqueDeferredCPUBuffer(void* cpu_buffer); + + bool own_stream_{true}; + + virtual void* GetResource(int version, int id) const; + + virtual WaitNotificationFn GetWaitNotificationFn() const { return WaitMIGraphXNotificationOnDevice; } + + private: + std::vector deferred_cpu_buffers_; + AllocatorPtr cpu_allocator_; + bool release_cpu_buffer_on_migraphx_stream_{true}; +}; + +void RegisterMIGraphXStreamHandles(IStreamCommandHandleRegistry& stream_handle_registry, + const OrtDevice::DeviceType device_type, + AllocatorPtr cpu_allocator, + bool release_cpu_buffer_on_migraphx_stream, + hipStream_t external_stream, + bool use_existing_stream); +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/shared_library/provider_api.h b/onnxruntime/core/providers/shared_library/provider_api.h index 590bddabdba54..2f54a04e15304 100644 --- a/onnxruntime/core/providers/shared_library/provider_api.h +++ b/onnxruntime/core/providers/shared_library/provider_api.h @@ -108,6 +108,7 @@ struct NodeProto; struct SparseTensorProto; struct StringStringEntryProto; struct StringStringEntryProtos; // RepeatedPtrField +struct OperatorSetIdProto; struct TensorProto; struct TensorProtos; // RepeatedPtrField struct TensorShapeProto_Dimension; @@ -120,6 +121,7 @@ struct TypeProto_Sequence; struct TypeProto; struct ValueInfoProto; struct ValueInfoProtos; // RepeatedPtrField +struct FunctionProto; struct InferenceContext; class GraphInferencer; using InferenceFunction = std::function; @@ -146,6 +148,7 @@ struct ConfigOptions; struct DataTransferManager; struct IndexedSubGraph; struct IndexedSubGraph_MetaDef; +enum class IndexedSubGraph_SourceOfSchema : uint8_t; struct KernelCreateInfo; struct KernelDef; struct KernelDefBuilder; @@ -279,6 +282,9 @@ std::unique_ptr CreateCPUAllocator(const OrtMemoryInfo& memory_info) std::unique_ptr CreateCUDAAllocator(int16_t device_id, const char* name); std::unique_ptr CreateCUDAPinnedAllocator(const char* name); +std::unique_ptr CreateMIGraphXAllocator(int16_t device_id, const char* name); +std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name); + std::unique_ptr CreateROCMAllocator(int16_t device_id, const char* name); std::unique_ptr CreateROCMPinnedAllocator(const char* name); diff --git a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc index 6e6a80f097c12..7fb9fd3c8cfd5 100644 --- 
a/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc +++ b/onnxruntime/core/providers/shared_library/provider_bridge_provider.cc @@ -353,16 +353,12 @@ std::unique_ptr CreateGPUDataTransfer() { #endif #ifdef USE_MIGRAPHX -std::unique_ptr CreateROCMAllocator(int16_t device_id, const char* name) { - return g_host->CreateROCMAllocator(device_id, name); +std::unique_ptr CreateMIGraphXAllocator(int16_t device_id, const char* name) { + return g_host->CreateMIGraphXAllocator(device_id, name); } -std::unique_ptr CreateROCMPinnedAllocator(const char* name) { - return g_host->CreateROCMPinnedAllocator(name); -} - -std::unique_ptr CreateGPUDataTransfer() { - return g_host->CreateGPUDataTransfer(); +std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name) { + return g_host->CreateMIGraphXPinnedAllocator(device_id, name); } #endif diff --git a/onnxruntime/core/providers/shared_library/provider_interfaces.h b/onnxruntime/core/providers/shared_library/provider_interfaces.h index bc6dac1a2f27f..382b3ac932520 100644 --- a/onnxruntime/core/providers/shared_library/provider_interfaces.h +++ b/onnxruntime/core/providers/shared_library/provider_interfaces.h @@ -179,6 +179,11 @@ struct ProviderHost { virtual void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) = 0; #endif +#ifdef USE_MIGRAPHX + virtual std::unique_ptr CreateMIGraphXAllocator(int16_t device_id, const char* name) = 0; + virtual std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name) = 0; +#endif + #ifdef USE_ROCM virtual std::unique_ptr CreateROCMAllocator(int16_t device_id, const char* name) = 0; virtual std::unique_ptr CreateROCMPinnedAllocator(const char* name) = 0; @@ -299,6 +304,11 @@ struct ProviderHost { virtual int StringStringEntryProtos__size(ONNX_NAMESPACE::StringStringEntryProtos* p) = 0; virtual ONNX_NAMESPACE::StringStringEntryProto& StringStringEntryProtos__at(ONNX_NAMESPACE::StringStringEntryProtos* p, int index) = 0; + // OperatorSetIdProto + virtual std::string* OperatorSetIdProto__mutable_domain(ONNX_NAMESPACE::OperatorSetIdProto* p) = 0; + virtual void OperatorSetIdProto__set_version(ONNX_NAMESPACE::OperatorSetIdProto* p, int64_t version) = 0; + virtual int64_t OperatorSetIdProto__version(const ONNX_NAMESPACE::OperatorSetIdProto* p) = 0; + #if !defined(DISABLE_OPTIONAL_TYPE) // TypeProto_Optional virtual const ONNX_NAMESPACE::TypeProto& TypeProto_Optional__elem_type(const ONNX_NAMESPACE::TypeProto_Optional* p) = 0; @@ -415,6 +425,11 @@ struct ProviderHost { virtual void ModelProto__set_ir_version(ONNX_NAMESPACE::ModelProto* p, int64_t value) = 0; virtual ONNX_NAMESPACE::StringStringEntryProtos* ModelProto__mutable_metadata_props(ONNX_NAMESPACE::ModelProto* p) = 0; + virtual const ONNX_NAMESPACE::OperatorSetIdProto& ModelProto__opset_import(const ONNX_NAMESPACE::ModelProto* p, int index) = 0; + virtual ONNX_NAMESPACE::OperatorSetIdProto* ModelProto__mutable_opset_import(ONNX_NAMESPACE::ModelProto* p, int index) = 0; + virtual int ModelProto__opset_import_size(const ONNX_NAMESPACE::ModelProto* p) = 0; + virtual ONNX_NAMESPACE::OperatorSetIdProto* ModelProto__add_opset_import(ONNX_NAMESPACE::ModelProto* p) = 0; + // NodeProto virtual std::unique_ptr NodeProto__construct() = 0; virtual void NodeProto__operator_delete(ONNX_NAMESPACE::NodeProto* p) = 0; @@ -422,6 +437,7 @@ struct ProviderHost { virtual int NodeProto__attribute_size(ONNX_NAMESPACE::NodeProto* p) = 0; 
virtual const ONNX_NAMESPACE::AttributeProto& NodeProto__attribute(const ONNX_NAMESPACE::NodeProto* p, int index) const = 0; virtual ONNX_NAMESPACE::AttributeProto* NodeProto__mutable_attribute(ONNX_NAMESPACE::NodeProto* p, int index) = 0; + virtual ONNX_NAMESPACE::AttributeProto* NodeProto__add_attribute(ONNX_NAMESPACE::NodeProto* p) = 0; // TensorProto virtual std::unique_ptr TensorProto__construct() = 0; @@ -490,6 +506,64 @@ struct ProviderHost { virtual const ONNX_NAMESPACE::ValueInfoProto& ValueInfoProtos__operator_array(const ONNX_NAMESPACE::ValueInfoProtos* p, int index) = 0; + // FunctionProto + virtual std::unique_ptr FunctionProto__construct() = 0; + virtual void FunctionProto__operator_delete(ONNX_NAMESPACE::FunctionProto* p) = 0; + + virtual bool FunctionProto__SerializeToString(const ONNX_NAMESPACE::FunctionProto* p, std::string& string) = 0; + virtual bool FunctionProto__SerializeToOstream(const ONNX_NAMESPACE::FunctionProto* p, std::ostream& output) = 0; + virtual bool FunctionProto__ParseFromString(ONNX_NAMESPACE::FunctionProto* p, const std::string& data) = 0; + virtual std::string FunctionProto__SerializeAsString(const ONNX_NAMESPACE::FunctionProto* p) = 0; + + virtual bool FunctionProto__has_name(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual const std::string& FunctionProto__name(const ONNX_NAMESPACE::FunctionProto* p) const = 0; + virtual void FunctionProto__set_name(ONNX_NAMESPACE::FunctionProto* p, const ::std::string& name) = 0; + + virtual bool FunctionProto__has_doc_string(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual const std::string& FunctionProto__doc_string(const ONNX_NAMESPACE::FunctionProto* p) const = 0; + virtual void FunctionProto__set_doc_string(ONNX_NAMESPACE::FunctionProto* p, const ::std::string& doc_string) = 0; + + virtual bool FunctionProto__has_domain(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual const std::string& FunctionProto__domain(const ONNX_NAMESPACE::FunctionProto* p) const = 0; + virtual void FunctionProto__set_domain(ONNX_NAMESPACE::FunctionProto* p, const ::std::string& domain) = 0; + + virtual const std::string& FunctionProto__input(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual std::string* FunctionProto__mutable_input(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__input_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual void FunctionProto__add_input(ONNX_NAMESPACE::FunctionProto* p, const std::string& value) = 0; + + virtual const std::string& FunctionProto__output(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual std::string* FunctionProto__mutable_output(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__output_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual void FunctionProto__add_output(ONNX_NAMESPACE::FunctionProto* p, const std::string& value) = 0; + + virtual const std::string& FunctionProto__attribute(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual std::string* FunctionProto__mutable_attribute(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__attribute_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual void FunctionProto__add_attribute(ONNX_NAMESPACE::FunctionProto* p, const std::string& value) = 0; + + virtual const ONNX_NAMESPACE::AttributeProto& FunctionProto__attribute_proto(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual ONNX_NAMESPACE::AttributeProto* 
FunctionProto__mutable_attribute_proto(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__attribute_proto_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual ONNX_NAMESPACE::AttributeProto* FunctionProto__add_attribute_proto(ONNX_NAMESPACE::FunctionProto* p) = 0; + + virtual const ONNX_NAMESPACE::NodeProto& FunctionProto__node(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual ONNX_NAMESPACE::NodeProto* FunctionProto__mutable_node(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__node_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual ONNX_NAMESPACE::NodeProto* FunctionProto__add_node(ONNX_NAMESPACE::FunctionProto* p) = 0; + + virtual const ONNX_NAMESPACE::ValueInfoProto& FunctionProto__value_info(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual ONNX_NAMESPACE::ValueInfoProtos* FunctionProto__mutable_value_info(ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual ONNX_NAMESPACE::ValueInfoProto* FunctionProto__mutable_value_info(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__value_info_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual ONNX_NAMESPACE::ValueInfoProto* FunctionProto__add_value_info(ONNX_NAMESPACE::FunctionProto* p) = 0; + + virtual const ONNX_NAMESPACE::StringStringEntryProto& FunctionProto__metadata_props(const ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual ONNX_NAMESPACE::StringStringEntryProtos* FunctionProto__mutable_metadata_props(ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual ONNX_NAMESPACE::StringStringEntryProto* FunctionProto__mutable_metadata_props(ONNX_NAMESPACE::FunctionProto* p, int index) = 0; + virtual int FunctionProto__metadata_props_size(const ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual ONNX_NAMESPACE::StringStringEntryProto* FunctionProto__add_metadata_props(ONNX_NAMESPACE::FunctionProto* p) = 0; + virtual void RegisterSchema(const std::string& domain, const OrtCustomOp* op, int type) = 0; // ConfigOptions @@ -541,6 +615,9 @@ struct ProviderHost { virtual void IndexedSubGraph__SetMetaDef(IndexedSubGraph* p, std::unique_ptr&& meta_def_) = 0; virtual const IndexedSubGraph_MetaDef* IndexedSubGraph__GetMetaDef(const IndexedSubGraph* p) = 0; + virtual void IndexedSubGraph__SetSchemaSource(IndexedSubGraph* p, IndexedSubGraph_SourceOfSchema schema_source) = 0; + virtual IndexedSubGraph_SourceOfSchema IndexedSubGraph__GetSchemaSource(const IndexedSubGraph* p) = 0; + // KernelDef virtual void KernelDef__operator_delete(KernelDef* p) = 0; virtual int KernelDef__ExecQueueId(const KernelDef* p) = 0; diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index fb3b274d9b80b..de6c1da1d6430 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -80,6 +80,15 @@ struct StringStringEntryProtos final { PROVIDER_DISALLOW_ALL(StringStringEntryProtos) }; + +struct OperatorSetIdProto final { + std::string* mutable_domain() { return g_host->OperatorSetIdProto__mutable_domain(this); } + void set_version(int64_t version) { return g_host->OperatorSetIdProto__set_version(this, version); } + int64_t version() { return g_host->OperatorSetIdProto__version(this); } + + PROVIDER_DISALLOW_ALL(OperatorSetIdProto) +}; + struct AttributeProto final { static std::unique_ptr Create() { return g_host->AttributeProto__construct(); } void 
operator=(const AttributeProto& v) { g_host->AttributeProto__operator_assign(this, v); } @@ -178,6 +187,11 @@ struct ModelProto final { void set_ir_version(int64_t value) { return g_host->ModelProto__set_ir_version(this, value); } + const OperatorSetIdProto& opset_import(int index) const { return g_host->ModelProto__opset_import(this, index); } + OperatorSetIdProto* mutable_opset_import(int index) { return g_host->ModelProto__mutable_opset_import(this, index); } + int opset_import_size() const { return g_host->ModelProto__opset_import_size(this); } + OperatorSetIdProto* add_opset_import() { return g_host->ModelProto__add_opset_import(this); } + ModelProto() = delete; ModelProto(const ModelProto&) = delete; void operator=(const ModelProto&) = delete; @@ -190,6 +204,7 @@ struct NodeProto final { int attribute_size() { return g_host->NodeProto__attribute_size(this); } const AttributeProto& attribute(int index) const { return g_host->NodeProto__attribute(this, index); } AttributeProto* mutable_attribute(int index) { return g_host->NodeProto__mutable_attribute(this, index); } + AttributeProto* add_attribute() { return g_host->NodeProto__add_attribute(this); } NodeProto() = delete; NodeProto(const NodeProto&) = delete; @@ -372,6 +387,69 @@ struct ValueInfoProtos final { PROVIDER_DISALLOW_ALL(ValueInfoProtos) }; + +struct FunctionProto final { + static std::unique_ptr Create() { return g_host->FunctionProto__construct(); } + static void operator delete(void* p) { g_host->FunctionProto__operator_delete(reinterpret_cast(p)); } + + bool SerializeToString(std::string& string) const { return g_host->FunctionProto__SerializeToString(this, string); } + bool SerializeToOstream(std::ostream& output) const { return g_host->FunctionProto__SerializeToOstream(this, output); } + bool ParseFromString(const std::string& data) { return g_host->FunctionProto__ParseFromString(this, data); } + std::string SerializeAsString() const { return g_host->FunctionProto__SerializeAsString(this); } + + bool has_name() const { return g_host->FunctionProto__has_name(this); } + const std::string& name() const { return g_host->FunctionProto__name(this); } + void set_name(const std::string& name) { g_host->FunctionProto__set_name(this, name); } + + bool has_doc_string() const { return g_host->FunctionProto__has_doc_string(this); } + const std::string& doc_string() const { return g_host->FunctionProto__doc_string(this); } + void set_doc_string(const std::string& doc_string) { g_host->FunctionProto__set_doc_string(this, doc_string); } + + bool has_domain() const { return g_host->FunctionProto__has_domain(this); } + const std::string& domain() const { return g_host->FunctionProto__domain(this); } + void set_domain(const std::string& domain) { g_host->FunctionProto__set_domain(this, domain); } + + const std::string& input(int index) const { return g_host->FunctionProto__input(this, index); } + std::string* mutable_input(int index) { return g_host->FunctionProto__mutable_input(this, index); } + int input_size() const { return g_host->FunctionProto__input_size(this); } + void add_input(const std::string& value) { g_host->FunctionProto__add_input(this, value); } + + const std::string& output(int index) const { return g_host->FunctionProto__output(this, index); } + std::string* mutable_output(int index) { return g_host->FunctionProto__mutable_output(this, index); } + int output_size() const { return g_host->FunctionProto__output_size(this); } + void add_output(const std::string& value) { g_host->FunctionProto__add_output(this, value); } + 
+ const std::string& attribute(int index) const { return g_host->FunctionProto__attribute(this, index); } + std::string* mutable_attribute(int index) { return g_host->FunctionProto__mutable_attribute(this, index); } + int attribute_size() const { return g_host->FunctionProto__attribute_size(this); } + void add_attribute(const std::string& value) { g_host->FunctionProto__add_attribute(this, value); } + + const AttributeProto& attribute_proto(int index) const { return g_host->FunctionProto__attribute_proto(this, index); } + AttributeProto* mutable_attribute_proto(int index) { return g_host->FunctionProto__mutable_attribute_proto(this, index); } + int attribute_proto_size() const { return g_host->FunctionProto__attribute_proto_size(this); } + AttributeProto* add_attribute_proto() { return g_host->FunctionProto__add_attribute_proto(this); } + + const NodeProto& node(int index) const { return g_host->FunctionProto__node(this, index); } + NodeProto* mutable_node(int index) { return g_host->FunctionProto__mutable_node(this, index); } + int node_size() const { return g_host->FunctionProto__node_size(this); } + NodeProto* add_node() { return g_host->FunctionProto__add_node(this); } + + const ValueInfoProto& value_info(int index) const { return g_host->FunctionProto__value_info(this, index); } + ValueInfoProtos* mutable_value_info() { return g_host->FunctionProto__mutable_value_info(this); } + ValueInfoProto* mutable_value_info(int index) { return g_host->FunctionProto__mutable_value_info(this, index); } + int value_info_size() const { return g_host->FunctionProto__value_info_size(this); } + ValueInfoProto* add_value_info() { return g_host->FunctionProto__add_value_info(this); } + + const StringStringEntryProto& metadata_props(int index) const { return g_host->FunctionProto__metadata_props(this, index); } + StringStringEntryProtos* mutable_metadata_props() { return g_host->FunctionProto__mutable_metadata_props(this); } + StringStringEntryProto* mutable_metadata_props(int index) { return g_host->FunctionProto__mutable_metadata_props(this, index); } + int metadata_props_size() const { return g_host->FunctionProto__metadata_props_size(this); } + StringStringEntryProto* add_metadata_props() { return g_host->FunctionProto__add_metadata_props(this); } + + FunctionProto() = delete; + FunctionProto(const FunctionProto&) = delete; + void operator=(const FunctionProto&) = delete; +}; } // namespace ONNX_NAMESPACE namespace onnxruntime { @@ -449,6 +527,12 @@ struct IndexedSubGraph_MetaDef final { void operator=(const IndexedSubGraph_MetaDef&) = delete; }; +enum class IndexedSubGraph_SourceOfSchema : uint8_t { + CREATE, + REUSE_OR_CREATE, + EXISTING, +}; + struct IndexedSubGraph final { static std::unique_ptr Create() { return g_host->IndexedSubGraph__construct(); } static void operator delete(void* p) { g_host->IndexedSubGraph__operator_delete(reinterpret_cast(p)); } @@ -458,6 +542,9 @@ struct IndexedSubGraph final { void SetMetaDef(std::unique_ptr&& meta_def_) { return g_host->IndexedSubGraph__SetMetaDef(this, std::move(*reinterpret_cast*>(&meta_def_))); } const IndexedSubGraph_MetaDef* GetMetaDef() const { return reinterpret_cast(g_host->IndexedSubGraph__GetMetaDef(this)); } + void SetSchemaSource(IndexedSubGraph_SourceOfSchema schema_source) { return g_host->IndexedSubGraph__SetSchemaSource(this, schema_source); } + IndexedSubGraph_SourceOfSchema GetSchemaSource() const { return g_host->IndexedSubGraph__GetSchemaSource(this); } + IndexedSubGraph() = delete; IndexedSubGraph(const IndexedSubGraph&) = delete; 
void operator=(const IndexedSubGraph&) = delete; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 8a601c156bd0a..67cbc8f5d6f13 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -70,7 +70,14 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_mapgetName(); auto dynamic_range_iter = dynamic_range_map.find(tensor_name); if (dynamic_range_iter != dynamic_range_map.end()) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif if (!network.getInput(i)->setDynamicRange(-dynamic_range_iter->second, dynamic_range_iter->second)) { +#if defined(_MSC_VER) +#pragma warning(pop) +#endif LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for network input " << tensor_name; return false; } @@ -84,7 +91,14 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_mapgetOutput(j)->getName(); auto dynamic_range_iter = dynamic_range_map.find(tensor_name); if (dynamic_range_iter != dynamic_range_map.end()) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif if (!trt_layer->getOutput(j)->setDynamicRange(-dynamic_range_iter->second, dynamic_range_iter->second)) { +#if defined(_MSC_VER) +#pragma warning(pop) +#endif LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for tensor " << tensor_name; return false; } @@ -122,7 +136,14 @@ bool SetDynamicRange(nvinfer1::INetworkDefinition& network, std::unordered_mapgetOutput(j)->setDynamicRange(static_cast(-max_weight), static_cast(max_weight))) { +#if defined(_MSC_VER) +#pragma warning(pop) +#endif LOGS_DEFAULT(ERROR) << "Failed to set dynamic range for layer " << const_layer_name; return false; } @@ -2232,7 +2253,14 @@ SubGraphCollection_t TensorrtExecutionProvider::GetSupportedList(SubGraphCollect auto trt_network = std::unique_ptr(trt_builder->createNetworkV2(network_flags)); auto trt_parser = tensorrt_ptr::unique_pointer(nvonnxparser::createParser(*trt_network, trt_logger)); +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif trt_parser->supportsModel(string_buf.data(), string_buf.size(), parser_nodes_list, model_path_); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif SubGraphCollection_t next_nodes_list; const std::vector& subgraph_node_index = graph_viewer->GetNodesInTopologicalOrder(1 /*priority-based topological sort*/); @@ -3074,7 +3102,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView } else { // Set INT8 per tensor dynamic range if (int8_enable_ && trt_builder->platformHasFastInt8() && int8_calibration_cache_available_) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif trt_config->setInt8Calibrator(nullptr); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (!SetDynamicRange(*trt_network, dynamic_range_map)) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP could not set INT8 dynamic range for fused node: " + fused_node.Name()); @@ -3193,7 +3228,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Note: Creating an execution context from an engine is thread safe per TRT doc // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading if (context_memory_sharing_enable_) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 
4996) +#endif size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (mem_size > max_ctx_mem_size_) { max_ctx_mem_size_ = mem_size; } @@ -3466,7 +3508,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Set INT8 Per Tensor Dynamic range if (trt_state->int8_enable && trt_builder->platformHasFastInt8() && trt_state->int8_calibration_cache_available) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif trt_config->setInt8Calibrator(nullptr); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (!SetDynamicRange(*trt_state->network->get(), trt_state->dynamic_range_map)) { return ORT_MAKE_STATUS(ONNXRUNTIME, EP_FAIL, "TensorRT EP failed to set INT8 dynamic range."); } @@ -3734,7 +3783,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView // Set execution context memory if (trt_state->context_memory_sharing_enable) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (mem_size > *max_context_mem_size_ptr) { *max_context_mem_size_ptr = mem_size; } @@ -3865,7 +3921,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con // Note: Creating an execution context from an engine is thread safe per TRT doc // https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading if (context_memory_sharing_enable_) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (mem_size > max_ctx_mem_size_) { max_ctx_mem_size_ = mem_size; } @@ -4038,7 +4101,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con // Set execution context memory if (trt_state->context_memory_sharing_enable) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) +#endif size_t mem_size = trt_engine->getDeviceMemorySize(); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif if (mem_size > *max_context_mem_size_ptr) { *max_context_mem_size_ptr = mem_size; } diff --git a/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc b/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc new file mode 100644 index 0000000000000..ab31aa313cf6d --- /dev/null +++ b/onnxruntime/core/providers/vitisai/imp/ep_context_utils.cc @@ -0,0 +1,682 @@ +// Standard headers/libs. +#include +#include +#include +#include + +// 3rd-party headers/libs. 
+#include + +#include "ep_context_utils.h" + +namespace onnxruntime { + +constexpr const char* kVitisAI = "vitisai"; + +std::unique_ptr ConvertIndexedSubGraphToFunctionProto( + const IndexedSubGraph& sub_graph, const Graph& parent_graph) { + auto p_func_proto = ONNX_NAMESPACE::FunctionProto::Create(); + auto* p_meta_def = const_cast(sub_graph.GetMetaDef()); + if (p_meta_def) { + p_func_proto->set_name(p_meta_def->name()); + p_func_proto->set_domain(p_meta_def->domain()); + for (const auto& input : p_meta_def->inputs()) { + p_func_proto->add_input(input); + } + auto* p_metadata_props_0 = p_func_proto->add_metadata_props(); + *(p_metadata_props_0->mutable_key()) = "meta_def_inputs_size"; + *(p_metadata_props_0->mutable_value()) = std::to_string(p_meta_def->inputs().size()); + for (const auto& output : p_meta_def->outputs()) { + p_func_proto->add_output(output); + } + // XXX: SerDes with different fields. + for (const auto& initializer : p_meta_def->constant_initializers()) { + p_func_proto->add_input(initializer); + } + // XXX: SerDes with different numbers of fields. + for (const auto& attr_pair : p_meta_def->attributes()) { + p_func_proto->add_attribute(attr_pair.first); + auto* p_attr_proto = p_func_proto->add_attribute_proto(); + *p_attr_proto = attr_pair.second; + } + p_func_proto->set_doc_string(p_meta_def->doc_string()); + // "since_version" + auto* p_metadata_props_1 = p_func_proto->add_metadata_props(); + *(p_metadata_props_1->mutable_key()) = "meta_def_since_version"; + *(p_metadata_props_1->mutable_value()) = std::to_string(p_meta_def->since_version()); + // "status" + auto* p_metadata_props_2 = p_func_proto->add_metadata_props(); + *(p_metadata_props_2->mutable_key()) = "meta_def_status"; + *(p_metadata_props_2->mutable_value()) = + std::to_string(static_cast(p_meta_def->status())); + // TODO: `MetaDef::type_and_shape_inference_function`. + } + auto p_parent_graph_proto = parent_graph.ToGraphProto(); + for (auto node_index : const_cast(sub_graph).Nodes()) { + auto* p_node_proto = p_parent_graph_proto->mutable_node(static_cast(node_index)); + auto* p_attr_proto = p_node_proto->add_attribute(); + p_attr_proto->set_name("parent_graph_node_index"); + p_attr_proto->set_type(ONNX_NAMESPACE::AttributeProto::INT); + p_attr_proto->set_i(node_index); + *(p_func_proto->add_node()) = *p_node_proto; + } +#if 0 + // Alternative. + for (const auto node_index : sub_graph.Nodes()) { + const auto* p_node = parent_graph.GetNode(node_index); + auto p_node_proto = ONNX_NAMESPACE::NodeProto::Create(); + // XXX + p_node->ToProto(*p_node_proto, true); + auto* p_attr_proto = p_node_proto->add_attribute(); + p_attr_proto->set_name("parent_graph_node_index"); + p_attr_proto->set_type(ONNX_NAMESPACE::AttributeProto::INT); + p_attr_proto->set_i(node_index); + *(p_func_proto.add_node()) = *p_node_proto; + } +#endif + auto* p_metadata_props_3 = p_func_proto->add_metadata_props(); + *(p_metadata_props_3->mutable_key()) = "schema_source"; + *(p_metadata_props_3->mutable_value()) = + std::to_string(static_cast(sub_graph.GetSchemaSource())); + return p_func_proto; +} + +std::unique_ptr ConvertFunctionProtoToIndexedSubGraph( + const std::unique_ptr& p_func_proto) { + auto p_isg = IndexedSubGraph::Create(); + // "meta_def_inputs_size" (optional) and "schema_source". + int func_metadata_props_size = p_func_proto->metadata_props_size(); + // Precisely, func_metadata_props_size == 4, which implies + // `IndexedSubGraph::meta_def_` is not null and `IndexedSubGraph::nodes` > 1. 
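+  // Layout of metadata_props as written by ConvertIndexedSubGraphToFunctionProto():
+  //   [0] "meta_def_inputs_size", [1] "meta_def_since_version", [2] "meta_def_status",
+  //   [last] "schema_source" (the only entry when there is no MetaDef).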
+ if (func_metadata_props_size > 1) { + auto& prop0 = const_cast(p_func_proto->metadata_props(0)); + int isg_meta_def_inputs_size = std::stoi(*(prop0.mutable_value())); + auto p_meta_def = IndexedSubGraph_MetaDef::Create(); + p_meta_def->name() = p_func_proto->name(); + p_meta_def->domain() = p_func_proto->domain(); + auto& prop1 = const_cast(p_func_proto->metadata_props(1)); + p_meta_def->since_version() = std::stoi(*(prop1.mutable_value())); + auto& prop2 = const_cast(p_func_proto->metadata_props(2)); + p_meta_def->status() = static_cast(std::stoi(*(prop2.mutable_value()))); + auto& meta_def_inputs = p_meta_def->inputs(); + for (int i = 0; i < isg_meta_def_inputs_size; i++) { + meta_def_inputs.push_back(p_func_proto->input(i)); + } + auto& meta_def_outputs = p_meta_def->outputs(); + for (int i = 0, l = p_func_proto->output_size(); i < l; i++) { + meta_def_outputs.push_back(p_func_proto->output(i)); + } + auto& meta_def_initializers = p_meta_def->constant_initializers(); + for (int i = isg_meta_def_inputs_size, l = p_func_proto->input_size(); i < l; i++) { + meta_def_initializers.push_back(p_func_proto->input(i)); + } + auto& meta_def_attrs = p_meta_def->attributes(); + for (int i = 0, l = p_func_proto->attribute_size(); i < l; i++) { + meta_def_attrs.emplace(p_func_proto->attribute(i), p_func_proto->attribute_proto(i)); + } + p_meta_def->doc_string() = p_func_proto->doc_string(); + // TODO: `IndexedSubGraph::type_and_shape_inference_function`. + p_isg->SetMetaDef(std::move(p_meta_def)); + } + auto& isg_nodes = p_isg->Nodes(); + for (int i = 0, l = p_func_proto->node_size(); i < l; i++) { + const auto& node_proto = p_func_proto->node(i); + isg_nodes.push_back( + node_proto.attribute(const_cast(node_proto).attribute_size() - 1).i()); + } + auto schema_source = static_cast( + std::stoi(*(const_cast(p_func_proto->metadata_props(func_metadata_props_size - 1)).mutable_value()))); + p_isg->SetSchemaSource(schema_source); + return p_isg; +} + +std::string SerializeCapabilities( + const std::vector>& capability_ptrs, + const Graph& graph) { + std::stringstream ss; + for (const auto& p : capability_ptrs) { + auto& p_subgraph = p->SubGraph(); + auto p_func_proto = ConvertIndexedSubGraphToFunctionProto(*p_subgraph, graph); + std::string func_proto_buf; + p_func_proto->SerializeToString(func_proto_buf); + size_t buf_len = func_proto_buf.length(); + ss.write(reinterpret_cast(&buf_len), sizeof(buf_len)); + ss.write(func_proto_buf.data(), buf_len); + } + if (!ss.good()) { + ORT_THROW("Serialization stream bad"); + } + return ss.str(); +} + +void DeserializeCapabilities(const std::string& ser_capabilities, + std::vector>& capability_ptrs) { + std::istringstream ss(ser_capabilities); + while (!ss.eof()) { + size_t buf_len; + ss.read(reinterpret_cast(&buf_len), sizeof(buf_len)); + std::string buf(buf_len, '\0'); + ss.read(&buf[0], buf_len); + auto p_func_proto = ONNX_NAMESPACE::FunctionProto::Create(); + p_func_proto->ParseFromString(buf); + auto p_subgraph = ConvertFunctionProtoToIndexedSubGraph(p_func_proto); + capability_ptrs.push_back(ComputeCapability::Create(std::move(p_subgraph))); + } +} + +std::string SerializeOrigialGraph(const GraphViewer& graph_viewer) { + // XXX: Will Steps 1/2/3 suffice for restoring a model/graph later? + // Any information loss or mismatch? 
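+  // The serialized result is a JSON object of the rough form (a sketch, not a stable schema):
+  //   { "<domain>": "<opset version>", ..., "orig_graph_name": "...", "orig_model_path": "...",
+  //     "orig_model_proto_ser_str": "<serialized ModelProto bytes>" }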
+  // Step 1
+  const Graph& orig_graph = graph_viewer.GetGraph();
+  // Step 2
+  const Model& orig_model = orig_graph.GetModel();
+  // Step 3
+  auto p_orig_model_proto = const_cast(orig_model).ToProto();
+  if (p_orig_model_proto->opset_import_size() == 0) {
+    for (const auto& it : graph_viewer.DomainToVersionMap()) {
+      auto* p_opset_import = p_orig_model_proto->add_opset_import();
+      *(p_opset_import->mutable_domain()) = it.first;
+      p_opset_import->set_version(it.second);
+    }
+  }
+
+  nlohmann::json j_obj;
+  if (p_orig_model_proto->opset_import_size() > 0) {
+    for (int i = 0, n = p_orig_model_proto->opset_import_size(); i < n; ++i) {
+      auto& op_set_id_proto = const_cast(p_orig_model_proto->opset_import(i));
+      j_obj[*op_set_id_proto.mutable_domain()] = std::to_string(op_set_id_proto.version());
+    }
+  }
+  j_obj["orig_graph_name"] = graph_viewer.Name();
+  // TODO: platform dependency (Linux vs Windows).
+  j_obj["orig_model_path"] = graph_viewer.ModelPath().string();
+
+  // XXX: `ModelProto::SerializeToString` will lose some info,
+  // e.g., ModelProto.opset_import.
+  std::string ser_buf;
+  p_orig_model_proto->SerializeToString(ser_buf);
+  j_obj["orig_model_proto_ser_str"] = ser_buf;
+
+  return j_obj.dump(-1, ' ', false, nlohmann::json::error_handler_t::replace);
+}
+
+// Ref.: `CreateEpContextModel()` in the file "graph_partitioner.cc".
+ONNX_NAMESPACE::ModelProto* CreateEPContexModel(
+    const GraphViewer& graph_viewer,
+    const std::string& serialized_ctx_cache,
+    const std::string& ctx_cache_file_loc,
+    const int64_t embed_mode,
+    const std::string& backend_cache_dir,
+    const std::string& backend_cache_key,
+    bool saving_orig_graph,
+    const logging::Logger* p_logger) {
+  LOGS_DEFAULT(VERBOSE) << "[VitisAI EP] Creating EP context node";
+  // Create a new graph/model, reusing the graph name,
+  // the op-domain-to-opset-version map,
+  // and the op schema registry of the current graph.
+  // XXX: This approach (immediately below) has a memory fault issue (std::bad_alloc).
+  // auto& ep_ctx_graph = graph_viewer.CreateModel(*p_logger)->MainGraph();
+  // This approach (immediately below) has no memory fault issue.
+  auto p_temp_model = graph_viewer.CreateModel(*p_logger);
+  auto& ep_ctx_graph = p_temp_model->MainGraph();
+
+  const auto& graph_inputs = graph_viewer.GetInputs();
+  std::vector input_node_arg_ptrs;
+  input_node_arg_ptrs.reserve(graph_inputs.size());
+  // XXX: vs `GraphViewer::GetInputsIncludingInitializers()`.
+  for (const auto* p_node_arg : graph_inputs) {
+    auto& temp_node_arg = ep_ctx_graph.GetOrCreateNodeArg(
+        p_node_arg->Name(), p_node_arg->TypeAsProto());
+    input_node_arg_ptrs.push_back(&temp_node_arg);
+  }
+  const auto& graph_outputs = graph_viewer.GetOutputs();
+  std::vector output_node_arg_ptrs;
+  output_node_arg_ptrs.reserve(graph_outputs.size());
+  for (const auto* p_node_arg : graph_outputs) {
+    auto& temp_node_arg = ep_ctx_graph.GetOrCreateNodeArg(p_node_arg->Name(), p_node_arg->TypeAsProto());
+    output_node_arg_ptrs.push_back(&temp_node_arg);
+  }
+
+  // Attr "embed_mode".
+  auto p_attr_0 = ONNX_NAMESPACE::AttributeProto::Create();
+  p_attr_0->set_name(kEmbedModeAttr);
+  // p_attr_0->set_type(onnx::AttributeProto_AttributeType_INT);
+  p_attr_0->set_type(ONNX_NAMESPACE::AttributeProto::INT);
+  p_attr_0->set_i(embed_mode);
+  // Attr "ep_cache_context".
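+  // embed_mode == 0: the attribute stores the file name of an external cache file located next
+  // to the EP context model; otherwise the serialized cache payload itself is embedded here.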
+ auto p_attr_1 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_1->set_name(kEPCacheContextAttr); + // p_attr_1->set_type(onnx::AttributeProto_AttributeType_STRING); + p_attr_1->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + // Relative to the ONNX model file. + p_attr_1->set_s( + embed_mode == 0 ? fs::path(ctx_cache_file_loc).filename().string() : serialized_ctx_cache); + // Attr "source". + auto p_attr_2 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_2->set_name(kSourceAttr); + // p_attr_2->set_type(onnx::AttributeProto_AttributeType_STRING); + p_attr_2->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + p_attr_2->set_s(kVitisAIExecutionProvider); + // Attr "onnx_model_filename". + auto p_attr_3 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_3->set_name(kONNXModelFileNameAttr); + // p_attr_3->set_type(onnx::AttributeProto_AttributeType_STRING); + p_attr_3->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + p_attr_3->set_s(graph_viewer.ModelPath().filename().string()); + // Attr "notes". + auto p_attr_4 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_4->set_name(kNotesAttr); + // p_attr_4->set_type(onnx::AttributeProto_AttributeType_STRING); + p_attr_4->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + // FIXME: 2G-limit of ProtoBuf. + if (saving_orig_graph) { + p_attr_4->set_s(SerializeOrigialGraph(graph_viewer)); + } else { + nlohmann::json j_obj; + j_obj["backend_cache_dir"] = backend_cache_dir; + j_obj["backend_cache_key"] = backend_cache_key; + p_attr_4->set_s(j_obj.dump(-1, ' ', false, nlohmann::json::error_handler_t::replace)); + } + + auto p_node_attrs = NodeAttributes::Create(); + constexpr int num_attrs = 5; + p_node_attrs->reserve(num_attrs); + p_node_attrs->emplace(kEmbedModeAttr, *p_attr_0); + p_node_attrs->emplace(kEPCacheContextAttr, *p_attr_1); + p_node_attrs->emplace(kSourceAttr, *p_attr_2); + p_node_attrs->emplace(kONNXModelFileNameAttr, *p_attr_3); + p_node_attrs->emplace(kNotesAttr, *p_attr_4); + + // Since we don't implement `IExecutionProvider::GetEpContextNodes()` and + // thus don't leverage `CreateEpContextModel()` in the file "graph_partitioner.cc", + // we specify a brand-new node name here. + ep_ctx_graph.AddNode(kEPContextOpName, kEPContextOp, "", input_node_arg_ptrs, output_node_arg_ptrs, p_node_attrs.get(), kEPContextOpDomain); + + auto res_status = ep_ctx_graph.Resolve(); + ORT_ENFORCE(res_status.IsOK(), res_status.ErrorMessage()); + LOGS_DEFAULT(VERBOSE) << "Created EP context model graph resolved"; + + auto p_ep_ctx_graph_viewer = ep_ctx_graph.CreateGraphViewer(); + auto p_temp_model_2 = p_ep_ctx_graph_viewer->CreateModel(*p_logger); + auto p_ep_ctx_model_proto = p_temp_model_2->ToProto(); + p_ep_ctx_graph_viewer->ToProto(*p_ep_ctx_model_proto->mutable_graph(), true, true); + p_ep_ctx_model_proto->set_ir_version(ONNX_NAMESPACE::Version::IR_VERSION); + + return p_ep_ctx_model_proto.release(); +} + +// Ref.: `static common::Status Save(Model& model, int fd)` in the file "model.h". +void DumpEPContextModel( + const std::unique_ptr& p_model_proto, const std::string& ep_ctx_model_file_loc) { + std::fstream dump_stream(ep_ctx_model_file_loc, std::ios::out | std::ios::trunc | std::ios::binary); + p_model_proto->SerializeToOstream(dump_stream); + LOGS_DEFAULT(VERBOSE) << "[VitisAI EP] Dumped " << ep_ctx_model_file_loc; +} + +const Node* GetEPContextNodePtr(const Graph& graph) { + // TODO: Support for multi-node EP context model. 
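+  // Returns the first node whose op type is "EPContext", or nullptr if the graph has none.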
+ for (const auto* p_node : graph.Nodes()) { + if (p_node->OpType() == kEPContextOp) { + return p_node; + } + } + return nullptr; +} + +bool ValidateEPContextNode(const Graph& graph) { + // TODO: Support for multi-node EP context model. + const auto* p_node = GetEPContextNodePtr(graph); + assert(p_node != nullptr); + auto& attrs = p_node->GetAttributes(); + assert(attrs.count(kEmbedModeAttr) > 0); + assert(attrs.count(kEPCacheContextAttr) > 0); + assert(attrs.count(kSourceAttr) > 0); + const auto& source_val = attrs.at(kSourceAttr).s(); + if (source_val == kVitisAIExecutionProvider) { + return true; + } + size_t vitisai_len = std::strlen(kVitisAI); + assert(source_val.length() == vitisai_len); + for (size_t i = 0; i < vitisai_len; ++i) { + assert(static_cast(std::tolower(source_val[i])) == kVitisAI[i]); + } + return true; +} + +// Ref.: `CreateEpContextModel()` in the file "graph_partitioner.cc". +void CreateEPContexNodes( + Graph* p_ep_ctx_graph, + const std::vector& fused_nodes_and_graphs, + const std::string& serialized_ctx_cache, + const std::string& ctx_cache_file_loc, + const int64_t embed_mode, + const std::string& backend_cache_dir, + const std::string& backend_cache_key, + bool saving_orig_graph, + const logging::Logger* p_logger) { + LOGS_DEFAULT(VERBOSE) << "[VitisAI EP]Creating EP context nodes"; + int fused_index = 0; + for (const auto& fused_node_graph : fused_nodes_and_graphs) { + Node& fused_node = fused_node_graph.fused_node; + const auto& fused_name = fused_node.Name(); + const GraphViewer& graph_viewer = fused_node_graph.filtered_graph; + // FIXME + const auto& graph_inputs = graph_viewer.GetInputs(); + std::vector input_node_arg_ptrs; + input_node_arg_ptrs.reserve(graph_inputs.size()); + // XXX: vs `GraphViewer::GetInputsIncludingInitializers()`. + for (const auto* p_node_arg : graph_inputs) { + auto& temp_node_arg = p_ep_ctx_graph->GetOrCreateNodeArg( + p_node_arg->Name(), p_node_arg->TypeAsProto()); + input_node_arg_ptrs.push_back(&temp_node_arg); + } + const auto& graph_outputs = graph_viewer.GetOutputs(); + std::vector output_node_arg_ptrs; + output_node_arg_ptrs.reserve(graph_outputs.size()); + for (const auto* p_node_arg : graph_outputs) { + auto& temp_node_arg = p_ep_ctx_graph->GetOrCreateNodeArg(p_node_arg->Name(), p_node_arg->TypeAsProto()); + output_node_arg_ptrs.push_back(&temp_node_arg); + } + + auto p_node_attrs = NodeAttributes::Create(); + if (fused_index == 0) { + p_node_attrs->reserve(7); + // Attr "ep_cache_context". + auto p_attr_1 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_1->set_name(kEPCacheContextAttr); + p_attr_1->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + // Relative to the ONNX model file. + p_attr_1->set_s( + embed_mode == 0 ? fs::path(ctx_cache_file_loc).filename().string() : serialized_ctx_cache); + p_node_attrs->emplace(kEPCacheContextAttr, *p_attr_1); + // Attr "notes". + auto p_attr_4 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_4->set_name(kNotesAttr); + p_attr_4->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + // FIXME: 2G-limit of ProtoBuf. + if (saving_orig_graph) { + p_attr_4->set_s(SerializeOrigialGraph(graph_viewer)); + } else { + nlohmann::json j_obj; + j_obj["backend_cache_dir"] = backend_cache_dir; + j_obj["backend_cache_key"] = backend_cache_key; + p_attr_4->set_s(j_obj.dump(-1, ' ', false, nlohmann::json::error_handler_t::replace)); + } + p_node_attrs->emplace(kNotesAttr, *p_attr_4); + // Attr "main_context". 
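+      // Only the first fused node (fused_index == 0) carries the cache payload and the notes;
+      // it is marked as the main context node (main_context = 1), all others get main_context = 0.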
+ auto p_attr_5 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_5->set_name(kMainContextAttr); + p_attr_5->set_type(ONNX_NAMESPACE::AttributeProto::INT); + p_attr_5->set_i(1); + p_node_attrs->emplace(kMainContextAttr, *p_attr_5); + } else { + p_node_attrs->reserve(5); + // Attr "main_context". + auto p_attr_5 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_5->set_name(kMainContextAttr); + p_attr_5->set_type(ONNX_NAMESPACE::AttributeProto::INT); + p_attr_5->set_i(0); + p_node_attrs->emplace(kMainContextAttr, *p_attr_5); + } + // Attr "embed_mode". + auto p_attr_0 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_0->set_name(kEmbedModeAttr); + p_attr_0->set_type(ONNX_NAMESPACE::AttributeProto::INT); + p_attr_0->set_i(embed_mode); + p_node_attrs->emplace(kEmbedModeAttr, *p_attr_0); + // Attr "source". + auto p_attr_2 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_2->set_name(kSourceAttr); + p_attr_2->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + p_attr_2->set_s(kVitisAIExecutionProvider); + p_node_attrs->emplace(kSourceAttr, *p_attr_2); + // Attr "onnx_model_filename". + auto p_attr_3 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_3->set_name(kONNXModelFileNameAttr); + p_attr_3->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + p_attr_3->set_s(graph_viewer.ModelPath().filename().string()); + p_node_attrs->emplace(kONNXModelFileNameAttr, *p_attr_3); + // Attr "partition_name". + auto p_attr_6 = ONNX_NAMESPACE::AttributeProto::Create(); + p_attr_6->set_name(kPartitionNameAttr); + p_attr_6->set_type(ONNX_NAMESPACE::AttributeProto::STRING); + p_attr_6->set_s(fused_name); + p_node_attrs->emplace(kPartitionNameAttr, *p_attr_6); + + p_ep_ctx_graph->AddNode(fused_name, kEPContextOp, "", input_node_arg_ptrs, output_node_arg_ptrs, p_node_attrs.get(), kEPContextOpDomain); + + ++fused_index; + } + auto res_status = p_ep_ctx_graph->Resolve(); + ORT_ENFORCE(res_status.IsOK(), res_status.ErrorMessage()); + LOGS_DEFAULT(VERBOSE) << "Created EP context model graph resolved"; +} + +std::string RetrieveEPContextCache( + const Graph& graph, const PathString& ep_ctx_model_loc, bool binary_mode) { + // TODO: Support for multi-node EP context model. + const auto* p_node = GetEPContextNodePtr(graph); + const auto& attrs = p_node->GetAttributes(); + int64_t embed_mode = attrs.at(kEmbedModeAttr).i(); + const std::string& ep_ctx_cache = attrs.at(kEPCacheContextAttr).s(); + if (embed_mode) { + return ep_ctx_cache; + } + fs::path ep_ctx_fs_path(ep_ctx_model_loc); + // Attr "ep_cache_context" stores a relative path. + ep_ctx_fs_path.replace_filename(fs::path(ep_ctx_cache)); + // TODO: Validaion of the file location to make sure security is met. + if (!fs::exists(ep_ctx_fs_path) || !fs::is_regular_file(ep_ctx_fs_path)) { + ORT_THROW("File for EP context cache is missing"); + } + auto open_mode = binary_mode ? 
(std::ios::in | std::ios::binary) : std::ios::in;
+  std::ifstream ifs(ep_ctx_fs_path.string().c_str(), open_mode);
+  if (!ifs.is_open()) {
+    ORT_THROW("Exception opening EP context cache file");
+  }
+  ifs.seekg(0, ifs.end);
+  std::streampos cache_len = ifs.tellg();
+  if (cache_len == -1) {
+    ifs.close();
+    ORT_THROW("Error when operating EP context cache file");
+  } else if (cache_len == 0) {
+    ifs.close();
+    LOGS_DEFAULT(WARNING) << "Empty EP context cache file: " << ep_ctx_fs_path.string();
+    return "";
+  }
+  ifs.seekg(0, ifs.beg);
+  char* buf = new char[static_cast<size_t>(cache_len)];
+  ifs.read(buf, cache_len);
+  if (!ifs.good()) {
+    ifs.close();
+    ORT_THROW("Exception reading EP context cache file");
+  }
+  ifs.close();
+  // Construct with an explicit length: the cache may be binary data that is not NUL-terminated.
+  std::string cache_payload(buf, static_cast<size_t>(cache_len));
+  delete[] buf;
+  return cache_payload;
+}
+
+void RetrieveBackendCacheInfo(const Graph& graph, std::string& cache_dir, std::string& cache_key) {
+  // TODO: Support for multi-node EP context model.
+  const auto* p_node = GetEPContextNodePtr(graph);
+  if (p_node == nullptr) {
+    LOGS_DEFAULT(WARNING) << "Failed to retrieve cache info due to no EP context nodes";
+    return;
+  }
+  const auto& attrs = p_node->GetAttributes();
+  const auto& notes_str = attrs.at(kNotesAttr).s();
+  nlohmann::json j_obj = nlohmann::json::parse(notes_str);
+  cache_dir = j_obj["backend_cache_dir"].get();
+  cache_key = j_obj["backend_cache_key"].get();
+  if (cache_dir.empty()) {
+    LOGS_DEFAULT(WARNING) << "Retrieved backend cache dir empty";
+  }
+  if (cache_key.empty()) {
+    LOGS_DEFAULT(WARNING) << "Retrieved backend cache key empty";
+  }
+}
+
+std::unique_ptr RetrieveOriginalGraph(const Graph& ep_ctx_graph) {
+  // TODO: Support for multi-node EP context model.
+  const auto* p_node = GetEPContextNodePtr(ep_ctx_graph);
+  const auto& attrs = p_node->GetAttributes();
+  const auto& notes_str = attrs.at(kNotesAttr).s();
+  nlohmann::json j_obj = nlohmann::json::parse(notes_str);
+
+  const auto& orig_model_path = j_obj["orig_model_path"].get();
+  bool model_loaded = false;
+  auto p_model_proto = ONNX_NAMESPACE::ModelProto::Create();
+  if (!orig_model_path.empty() && fs::exists(orig_model_path) && fs::is_regular_file(orig_model_path)) {
+    auto load_status = Model::Load(ToPathString(orig_model_path), *p_model_proto);
+    model_loaded = load_status.IsOK();
+  }
+  if (!model_loaded) {
+    p_model_proto->ParseFromString(j_obj["orig_model_proto_ser_str"].get());
+    if (p_model_proto->opset_import_size() == 0) {
+      for (auto& elem : j_obj.items()) {
+        if (elem.key() == "orig_model_path" || elem.key() == "orig_graph_name" || elem.key() == "orig_model_proto_ser_str") {
+          continue;
+        }
+        auto* p_op_set_id_proto = p_model_proto->add_opset_import();
+        *(p_op_set_id_proto->mutable_domain()) = elem.key();
+        p_op_set_id_proto->set_version(std::stoll(elem.value().get()));
+      }
+    }
+  }
+  auto& logger = logging::LoggingManager::DefaultLogger();
+  auto p_model = Model::Create(std::move(*p_model_proto), ToPathString(orig_model_path), nullptr, logger);
+  auto& graph = p_model->MainGraph();
+  graph.ToGraphProto()->set_name(j_obj["orig_graph_name"].get());
+
+  return graph.CreateGraphViewer();
+}
+
+bool GraphHasEPContextNode(const Graph& graph) {
+  size_t vitisai_len = std::strlen(kVitisAI);
+  for (const auto* p_node : graph.Nodes()) {
+    if (p_node->OpType() != kEPContextOp) {
+      continue;
+    }
+    const auto& attrs = p_node->GetAttributes();
+    if (attrs.count(kSourceAttr) == 0) {
+      continue;
+    }
+    const auto& source_val = attrs.at(kSourceAttr).s();
+    if (source_val == kVitisAIExecutionProvider) {
+      return true;
+ } + if (source_val.length() != vitisai_len) { + continue; + } + size_t j = 0; + do { + if (static_cast(std::tolower(source_val[j])) != kVitisAI[j]) { + break; + } + ++j; + } while (j < vitisai_len); + if (j == vitisai_len) { + return true; + } + } + return false; +} + +bool FusedGraphHasEPContextNode( + const std::vector& fused_nodes_and_graphs) { + for (const auto& fused_node_graph : fused_nodes_and_graphs) { + bool has_node = GraphHasEPContextNode(fused_node_graph.filtered_graph.get().GetGraph()); + if (has_node) { + return true; + } + } + return false; +} + +const fs::path& GetTopLevelModelPath(const GraphViewer& graph_viewer) { + const auto& graph = graph_viewer.GetGraph(); + const Graph* p_graph = &graph; + while (p_graph->IsSubgraph()) { + p_graph = p_graph->ParentGraph(); + } + return p_graph->ModelPath(); +} + +bool GetEPContextModelFileLocation( + const std::string& ep_ctx_model_path_cfg, + const PathString& model_path_str, + bool is_ep_ctx_model, + PathString& ep_ctx_model_file_loc) { + if (!ep_ctx_model_file_loc.empty()) { + return true; + } + if (!ep_ctx_model_path_cfg.empty()) { + ep_ctx_model_file_loc = ToPathString(ep_ctx_model_path_cfg); + } else if (!model_path_str.empty()) { + if (is_ep_ctx_model) { + ep_ctx_model_file_loc = model_path_str; + } else { + // Two alternatives for this case. + // Alternative 1: + // 1) Implement/override the method `IExecutionProvider::GetEpContextNodes()`. + // 2) And follow how the default path is implemented in `CreateEpContextModel()` + // in the file "graph_partitioner.cc". + // 3) Model dump is not required. + // Alternative 2: + // 1) Do NOT implement/override `IExecutionProvider::GetEpContextNodes()`. + // 2) No need to follow `CreateEpContextModel()` in the file "graph_partitioner.cc", + // freely implement what the default path is like. + // 3) Model dump is required. +#if 0 + ep_ctx_model_file_loc = model_path_str + ToPathString("_ctx.onnx"); +#endif +#if 1 + fs::path model_fs_path(model_path_str); + fs::path ep_ctx_model_fs_path(model_fs_path.parent_path() / model_fs_path.stem()); + ep_ctx_model_fs_path += fs::path("_ctx.onnx"); + ep_ctx_model_file_loc = ToPathString(ep_ctx_model_fs_path.string()); +#endif + } + } + return !ep_ctx_model_file_loc.empty(); +} + +// The file for EP context cache is in the same folder as the EP context model file. +PathString GetEPContextCacheFileLocation( + const PathString& ep_ctx_model_file_loc, const PathString& model_path_str) { + if (!ep_ctx_model_file_loc.empty()) { + fs::path ep_ctx_model_fs_path(ep_ctx_model_file_loc); + fs::path ep_ctx_cache_fs_path(ep_ctx_model_fs_path.parent_path() / ep_ctx_model_fs_path.stem()); + ep_ctx_cache_fs_path += fs::path("__ep_ctx_cache.bin"); + return ToPathString(ep_ctx_cache_fs_path.string()); + } + fs::path model_fs_path(model_path_str); + fs::path ep_ctx_cache_fs_path(model_fs_path.parent_path() / model_fs_path.stem()); + ep_ctx_cache_fs_path += fs::path("__ep_ctx_cache.bin"); + return ToPathString(ep_ctx_cache_fs_path.string()); +} + +std::string Slurp(const fs::path& file_location, bool binary_mode) { + // std::filesystem::value_type == onnxruntime::PathChar == ORTCHAR_T + // std::filesystem::string_type == onnxruntime::PathString + // const char* location_str = PathToUTF8String(file_location.native()).c_str(); + std::ifstream ifs; + ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); + std::stringstream ss; + try { + auto open_mode = binary_mode ? 
(std::ios::in | std::ios::binary) : std::ios::in; + ifs.open(file_location.string().c_str(), open_mode); + ss << ifs.rdbuf(); + if (!ss.good()) { + LOGS_DEFAULT(WARNING) << "Failed to write to stream"; + } + ifs.close(); + } catch (std::system_error& se) { + LOGS_DEFAULT(WARNING) << "Failed to read " << file_location << ": " << se.code().message(); + } + return ss.str(); +} + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index e9ae93ded40c7..8c1dce0d3dc1a 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -53,6 +53,8 @@ struct OrtVitisAIEpAPI { std::vector>* (*compile_onnx_model_with_options)( const std::string& model_path, const onnxruntime::Graph& graph, const onnxruntime::ProviderOptions& options); uint32_t (*vaip_get_version)(); + void (*get_backend_compilation_cache)(const std::string& model_path, const onnxruntime::Graph& graph, const char* json_config, uint8_t compiler_codes, std::string& cache_dir, std::string& cache_key, std::string& cache_data); + void (*restore_backend_compilation_cache)(const std::string& cache_dir, const std::string& cache_key, const std::string& cache_data, const std::string& model_path); void Ensure() { if (handle_) return; @@ -77,6 +79,8 @@ struct OrtVitisAIEpAPI { } std::ignore = env.GetSymbolFromLibrary(handle_, "vaip_get_version", (void**)&vaip_get_version); + ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "get_compilation_cache", (void**)&get_backend_compilation_cache)); + ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "restore_compilation_cache", (void**)&restore_backend_compilation_cache)); } private: @@ -122,13 +126,7 @@ static std::string config_to_json_str(const onnxruntime::ProviderOptions& config vaip_core::DllSafe>> compile_onnx_model( const onnxruntime::GraphViewer& graph_viewer, const logging::Logger& logger, const ProviderOptions& options) { -#ifndef _WIN32 auto model_path = graph_viewer.ModelPath().string(); -#else - using convert_t = std::codecvt_utf8; - std::wstring_convert strconverter; - auto model_path = strconverter.to_bytes(graph_viewer.ModelPath().string()); -#endif if (s_library_vitisaiep.compile_onnx_model_with_options) { return vaip_core::DllSafe(s_library_vitisaiep.compile_onnx_model_with_options(model_path, graph_viewer.GetGraph(), options)); } else { @@ -137,6 +135,17 @@ vaip_core::DllSafe>> c } } +void get_backend_compilation_cache(const onnxruntime::PathString& model_path_str, const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::ProviderOptions& options, uint8_t compiler_codes, std::string& cache_dir, std::string& cache_key, std::string& cache_data) { + const std::string& model_path = PathToUTF8String(model_path_str); + const onnxruntime::Graph& graph = graph_viewer.GetGraph(); + const auto json_str = config_to_json_str(options); + s_library_vitisaiep.get_backend_compilation_cache(model_path, graph, json_str.c_str(), compiler_codes, cache_dir, cache_key, cache_data); +} + +void restore_backend_compilation_cache(const std::string& cache_dir, const std::string& cache_key, const std::string& cache_data, const std::string& model_path) { + s_library_vitisaiep.restore_backend_compilation_cache(cache_dir, cache_key, cache_data, model_path); +} + struct MyCustomOpKernel : OpKernel { MyCustomOpKernel(const OpKernelInfo& info, const OrtCustomOp& op) : OpKernel(info), op_(op) { op_kernel_ = @@ -218,9 +227,9 @@ vaip_core::OrtApiForVaip* 
create_org_api_hook() { auto& logger = logging::LoggingManager::DefaultLogger(); auto& model = const_cast(const_model); auto model_proto = model.ToProto(); - auto file_path = model.MainGraph().ModelPath().string(); + auto file_path = model.MainGraph().ModelPath(); auto local_registries = IOnnxRuntimeOpSchemaRegistryList{model.MainGraph().GetSchemaRegistry()}; - auto ret = Model::Create(std::move(*model_proto), file_path, &local_registries, logger); + auto ret = Model::Create(std::move(*model_proto), ToPathString(file_path), &local_registries, logger); auto status = ret->MainGraph().Resolve(); vai_assert(status.IsOK(), status.ErrorMessage()); return ret.release(); diff --git a/onnxruntime/core/providers/vitisai/imp/graph.cc b/onnxruntime/core/providers/vitisai/imp/graph.cc index 40b396fda6135..3f46fbde8c714 100644 --- a/onnxruntime/core/providers/vitisai/imp/graph.cc +++ b/onnxruntime/core/providers/vitisai/imp/graph.cc @@ -107,12 +107,11 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri auto graph_proto_subgraph = graph.ToGraphProto(); *model_proto->mutable_graph() = *graph_proto_subgraph; auto& logger = logging::LoggingManager::DefaultLogger(); - auto filename_data_relative_path = std::filesystem::path(); auto model = Model::Create(std::move(*model_proto), ToPathString(filename), nullptr, logger); if (initializer_size_threshold == std::numeric_limits::max()) { model_proto = model->ToProto(); } else { - model_proto = model->ToGraphProtoWithExternalInitializers(filename_dat, graph.ModelPath(), initializer_size_threshold); + model_proto = model->ToGraphProtoWithExternalInitializers(ToPathString(filename_dat), ToPathString(filename), initializer_size_threshold); } auto& metadata = model->MetaData(); if (!metadata.empty()) { @@ -124,7 +123,7 @@ void graph_save(const Graph& graph, const std::string& filename, const std::stri *prop->mutable_value() = m.second; } } - std::fstream output(filename, std::ios::out | std::ios::trunc | std::ios::binary); + std::fstream output(ToPathString(filename), std::ios::out | std::ios::trunc | std::ios::binary); bool result = model_proto->SerializeToOstream(output); output << std::flush; vai_assert(result, "model serialize to ostream error"); diff --git a/onnxruntime/core/providers/vitisai/include/ep_context_utils.h b/onnxruntime/core/providers/vitisai/include/ep_context_utils.h new file mode 100644 index 0000000000000..61a595cf1ae15 --- /dev/null +++ b/onnxruntime/core/providers/vitisai/include/ep_context_utils.h @@ -0,0 +1,81 @@ +#pragma once + +// Standard headers/libs. +#include +#include +#include +#include + +// 1st-party headers/libs. 
+#include "core/providers/shared_library/provider_api.h" + +namespace fs = std::filesystem; + +namespace onnxruntime { + +constexpr const uint8_t kXCCode = 1; +constexpr const uint8_t kDDCode = 2; +constexpr const uint8_t kVCode = 4; + +static constexpr const char* kEPContextOp = "EPContext"; +static constexpr const char* kMainContextAttr = "main_context"; +static constexpr const char* kEPCacheContextAttr = "ep_cache_context"; +static constexpr const char* kEmbedModeAttr = "embed_mode"; +static constexpr const char* kPartitionNameAttr = "partition_name"; +static constexpr const char* kSourceAttr = "source"; +static constexpr const char* kEPSDKVersionAttr = "ep_sdk_version"; +static constexpr const char* kONNXModelFileNameAttr = "onnx_model_filename"; +static constexpr const char* kNotesAttr = "notes"; +static constexpr const char* kEPContextOpDomain = "com.microsoft"; +static constexpr const char* kEPContextOpName = "VitisAIEPContextOp"; + +std::unique_ptr +ConvertIndexedSubGraphToFunctionProto(const IndexedSubGraph&, const Graph&); + +std::unique_ptr ConvertFunctionProtoToIndexedSubGraph( + const std::unique_ptr&); + +std::string SerializeCapabilities( + const std::vector>&, const Graph&); + +void DeserializeCapabilities( + const std::string&, std::vector>&); + +std::string SerializeOrigialGraph(const GraphViewer&); + +// Ref.: `CreateEpContextModel()` in the file "graph_partitioner.cc". +ONNX_NAMESPACE::ModelProto* CreateEPContexModel(const GraphViewer&, const std::string&, const std::string&, const int64_t, + const std::string&, const std::string&, bool, const logging::Logger*); + +// Ref.: `static common::Status Save(Model& model, int fd)` in the file "model.h". +void DumpEPContextModel(const std::unique_ptr&, const std::string&); + +const Node* GetEPContextNodePtr(const Graph&); + +bool ValidateEPContextNode(const Graph&); + +void CreateEPContexNodes(Graph*, const std::vector&, const std::string&, const std::string&, + const int64_t, const std::string&, const std::string&, bool, const logging::Logger*); + +std::string RetrieveEPContextCache(const Graph&, const PathString&, bool binary_mode = true); + +void RetrieveBackendCacheInfo(const Graph&, std::string&, std::string&); + +std::unique_ptr RetrieveOriginalGraph(const Graph&); + +bool GraphHasEPContextNode(const Graph&); + +bool FusedGraphHasEPContextNode( + const std::vector&); + +const fs::path& GetTopLevelModelPath(const GraphViewer&); + +bool GetEPContextModelFileLocation( + const std::string&, const PathString&, bool, PathString&); + +// The file for EP context cache is in the same folder as the EP context model file. 
+PathString GetEPContextCacheFileLocation(const PathString&, const PathString&); + +std::string Slurp(const fs::path&, bool binary_mode = false); + +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h index 1f8b8802e86b4..3fdbc60bb0ee6 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h @@ -14,3 +14,5 @@ void initialize_vitisai_ep(); vaip_core::DllSafe>> compile_onnx_model(const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::logging::Logger& logger, const onnxruntime::ProviderOptions& options); std::shared_ptr get_kernel_registry_vitisaiep(); const std::vector& get_domains_vitisaiep(); +void get_backend_compilation_cache(const onnxruntime::PathString& model_path_str, const onnxruntime::GraphViewer& graph_viewer, const onnxruntime::ProviderOptions& options, uint8_t compiler_codes, std::string& cache_dir, std::string& cache_key, std::string& cache_data); +void restore_backend_compilation_cache(const std::string& cache_dir, const std::string& cache_key, const std::string& cache_data, const std::string& model_path); diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc index 6fc09f3495aa1..f45b89649bfcb 100644 --- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc +++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc @@ -2,22 +2,43 @@ // Licensed under the MIT License. #include "vitisai_execution_provider.h" +// Standard headers/libs. #include #include #include +#include + +// 1st-party headers/libs. +#include "core/platform/env_var_utils.h" +#include "core/common/exceptions.h" #include "vaip/capability.h" #include "vaip/global_api.h" +#include "ep_context_utils.h" using namespace ONNX_NAMESPACE; +namespace fs = std::filesystem; + namespace onnxruntime { constexpr const char* VITISAI = "VITISAI"; VitisAIExecutionProvider::VitisAIExecutionProvider( const ProviderOptions& info) + // const ProviderOptions& info, const SessionOptions* p_sess_opts) : IExecutionProvider{onnxruntime::kVitisAIExecutionProvider}, info_(info) { CreateKernelRegistry(); + + auto it = info_.find("ep_context_enable"); + ep_ctx_enabled_ = it != info_.end() && it->second == "1"; + it = info_.find("ep_context_embed_mode"); + ep_ctx_embed_mode_ = it != info_.end() && it->second != "0"; + // ep_ctx_embed_mode_ = it == info_.end() || it->second != "0"; + it = info_.find("ep_context_file_path"); + ep_ctx_model_path_cfg_ = it == info_.end() ? "" : it->second; + LOGS_DEFAULT(VERBOSE) << "EP Context cache enabled: " << ep_ctx_enabled_; + LOGS_DEFAULT(VERBOSE) << "EP context cache embed mode: " << ep_ctx_embed_mode_; + LOGS_DEFAULT(VERBOSE) << "User specified EP context cache path: " << ep_ctx_model_path_cfg_; } void VitisAIExecutionProvider::CreateKernelRegistry() { @@ -30,9 +51,115 @@ void VitisAIExecutionProvider::CreateKernelRegistry() { std::shared_ptr VitisAIExecutionProvider::GetKernelRegistry() const { return get_kernel_registry_vitisaiep(); } +// This method is called after both `GetComputeCapabilityOps()` and `Compile()`. +// This timing is required to work with both compilation-based EPs and non-compilation-based EPs. +const InlinedVector VitisAIExecutionProvider::GetEpContextNodes() const { + InlinedVector ep_context_node_ptrs; + // All preconditions are supposed to have happened. 
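+  // That is, p_ep_ctx_model_ was created in PrepareEPContextEnablement() and its EPContext
+  // node(s) were added by FulfillEPContextEnablement() via CreateEPContexNodes().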
+  if (p_ep_ctx_model_) {
+    auto& graph = p_ep_ctx_model_->MainGraph();
+    for (const auto* p_node : graph.Nodes()) {
+      ep_context_node_ptrs.push_back(p_node);
+    }
+  }
+  return ep_context_node_ptrs;
+}
+
+void VitisAIExecutionProvider::LoadEPContexModelFromFile() const {
+  // XXX: should "p_ep_ctx_model_" be checked or not?
+  if (!p_ep_ctx_model_ && !ep_ctx_model_file_loc_.empty()) {
+    auto status = Model::Load(ep_ctx_model_file_loc_, *p_ep_ctx_model_proto_);
+    if (!status.IsOK()) {
+      ORT_THROW("Loading EP context model failed from ", PathToUTF8String(ep_ctx_model_file_loc_));
+    }
+    p_ep_ctx_model_ = Model::Create(std::move(*p_ep_ctx_model_proto_), ep_ctx_model_file_loc_, nullptr, *GetLogger());
+    LOGS_DEFAULT(VERBOSE) << "Loaded EP context model from: " << PathToUTF8String(ep_ctx_model_file_loc_);
+  } else if (ep_ctx_model_file_loc_.empty()) {
+    LOGS_DEFAULT(WARNING) << "Cannot load an EP-context model due to bad file path";
+  }
+}
+
+void VitisAIExecutionProvider::PrepareEPContextEnablement(
+    const onnxruntime::GraphViewer& graph_viewer) const {
+  if (model_path_str_.empty()) {
+    // TODO: platform dependency (Linux vs Windows).
+    model_path_str_ = ToPathString(GetTopLevelModelPath(graph_viewer).string());
+  }
+  std::string backend_cache_dir, backend_cache_key;
+  get_backend_compilation_cache(model_path_str_, graph_viewer, info_, kXCCode, backend_cache_dir, backend_cache_key, backend_cache_data_);
+  info_["cacheDir"] = backend_cache_dir;
+  info_["cacheKey"] = backend_cache_key;
+  // Create a new model, reusing the graph name, the op-domain-to-opset-version map,
+  // the op schema registry of the current graph, etc.
+  p_ep_ctx_model_ = graph_viewer.CreateModel(*GetLogger());
+  LOGS_DEFAULT(VERBOSE) << "Container model created";
+}
+
+void VitisAIExecutionProvider::FulfillEPContextEnablement(
+    const std::vector& fused_nodes_and_graphs) {
+  auto& ep_ctx_graph = p_ep_ctx_model_->MainGraph();
+  if (!ep_ctx_embed_mode_) {
+    auto ep_ctx_cache_path_str = GetEPContextCacheFileLocation(ep_ctx_model_file_loc_, model_path_str_);
+    std::ofstream ep_ctx_cache_ofs(ep_ctx_cache_path_str.c_str(), std::ios::trunc);
+    if (!ep_ctx_cache_ofs.is_open()) {
+      ORT_THROW("Failed to open a file to write EP context cache: ", ep_ctx_cache_path_str.c_str());
+    }
+    ep_ctx_cache_ofs.write(backend_cache_data_.c_str(), backend_cache_data_.length());
+    if (!ep_ctx_cache_ofs.good()) {
+      ep_ctx_cache_ofs.close();
+      ORT_THROW("Exception writing EP context cache file: ", ep_ctx_cache_path_str.c_str());
+    }
+    ep_ctx_cache_ofs.close();
+    CreateEPContexNodes(&ep_ctx_graph, fused_nodes_and_graphs, "", PathToUTF8String(ep_ctx_cache_path_str), 0, info_.at("cacheDir"), info_.at("cacheKey"), false, GetLogger());
+  } else {
+    CreateEPContexNodes(&ep_ctx_graph, fused_nodes_and_graphs, backend_cache_data_, "", 1, info_["cacheDir"], info_["cacheKey"], false, GetLogger());
+  }
+  if (GraphHasEPContextNode(ep_ctx_graph)) {
+    LOGS_DEFAULT(VERBOSE) << "Created model has EP context nodes";
+  } else {
+    LOGS_DEFAULT(WARNING) << "No EP context nodes created";
+  }
+}
+
 std::vector> VitisAIExecutionProvider::GetCapability(
-    const onnxruntime::GraphViewer& graph, const IKernelLookup& /*kernel_lookup*/) const {
-  if (graph.IsSubgraph()) {
+    const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/) const {
+  bool is_ep_ctx_model = GraphHasEPContextNode(graph_viewer.GetGraph());
+  // TODO: platform dependency (Linux vs Windows).
+ model_path_str_ = ToPathString(GetTopLevelModelPath(graph_viewer).string()); + if (GetEPContextModelFileLocation( + ep_ctx_model_path_cfg_, model_path_str_, is_ep_ctx_model, ep_ctx_model_file_loc_)) { + if (is_ep_ctx_model) { + LOGS_DEFAULT(VERBOSE) << "An EP context model passed in"; + ValidateEPContextNode(graph_viewer.GetGraph()); + std::string cache_dir, cache_key; + RetrieveBackendCacheInfo(graph_viewer.GetGraph(), cache_dir, cache_key); + info_["cacheDir"] = cache_dir; + info_["cacheKey"] = cache_key; + LOGS_DEFAULT(VERBOSE) << "Trying getting compilation cache from " << PathToUTF8String(ep_ctx_model_file_loc_); + auto ep_ctx_payload = RetrieveEPContextCache(graph_viewer.GetGraph(), ep_ctx_model_file_loc_, false); + restore_backend_compilation_cache(cache_dir, cache_key, ep_ctx_payload, graph_viewer.ModelPath().string()); + } else { + if (fs::exists(ep_ctx_model_file_loc_) && fs::is_regular_file(ep_ctx_model_file_loc_) && ep_ctx_enabled_) { + ORT_THROW("The inference session was created with a normal ONNX model but a model file with EP context cache exists at ", + PathToUTF8String(ep_ctx_model_file_loc_), ". Please remove the EP context model manually if you want to re-generate it."); + // Disable the flexibility implemented below by throwing an exception. + // Now the code below is unreachable but DCE will take care of it. + // We might want to re-enable it in future, so we keep it as is. + LoadEPContexModelFromFile(); + ValidateEPContextNode(p_ep_ctx_model_->MainGraph()); + std::string cache_dir, cache_key; + RetrieveBackendCacheInfo(p_ep_ctx_model_->MainGraph(), cache_dir, cache_key); + info_["cacheDir"] = cache_dir; + info_["cacheKey"] = cache_key; + auto ep_ctx_payload = RetrieveEPContextCache(p_ep_ctx_model_->MainGraph(), ep_ctx_model_file_loc_, false); + restore_backend_compilation_cache(cache_dir, cache_key, ep_ctx_payload, graph_viewer.ModelPath().string()); + } + } + } else { + LOGS_DEFAULT(WARNING) << "Failed to get EP context model file location"; + } + + if (graph_viewer.IsSubgraph()) { // VITIS AI EP not support sungraph. Assigned to CPU. return {}; } @@ -40,13 +167,16 @@ std::vector> VitisAIExecutionProvider::GetCap // Only compiling a model once is currently supported return {}; } - execution_providers_ = std::make_unique(compile_onnx_model(graph, *GetLogger(), info_)); - auto result = vaip::GetComputeCapabilityOps(graph, execution_providers_.get(), vitisai_optypes_); + execution_providers_ = std::make_unique(compile_onnx_model(graph_viewer, *GetLogger(), info_)); + auto result = vaip::GetComputeCapabilityOps(graph_viewer, execution_providers_.get(), vitisai_optypes_); size_t index = 0u; for (auto& ep : **execution_providers_) { - result.emplace_back(vaip::XirSubgraphToComputeCapability1(graph, ep.get(), index)); + result.emplace_back(vaip::XirSubgraphToComputeCapability1(graph_viewer, ep.get(), index)); index = index + 1; } + if (ep_ctx_enabled_ && !is_ep_ctx_model) { + PrepareEPContextEnablement(graph_viewer); + } return result; } @@ -74,6 +204,10 @@ common::Status VitisAIExecutionProvider::Compile(const std::vector #include #include #include #include +// 1st-party headers/libs. 
+// #include "core/framework/session_options.h" #include "core/providers/shared_library/provider_api.h" #include "core/session/onnxruntime_c_api.h" +#include "core/common/inlined_containers_fwd.h" // we cannot include vaip/vaip.hpp here because header file referred by // onnxruntime_pybind_state_common.cc @@ -24,9 +28,11 @@ namespace onnxruntime { class VitisAIExecutionProvider : public IExecutionProvider { public: explicit VitisAIExecutionProvider(const ProviderOptions& info); + // explicit VitisAIExecutionProvider(const ProviderOptions& info, + // const SessionOptions* p_sess_opts = nullptr); ~VitisAIExecutionProvider() = default; - std::vector> GetCapability(const onnxruntime::GraphViewer& graph, + std::vector> GetCapability(const onnxruntime::GraphViewer& graph_viewer, const IKernelLookup& /*kernel_lookup*/) const override; int GetDeviceId() const { return 0; } @@ -35,16 +41,34 @@ class VitisAIExecutionProvider : public IExecutionProvider { std::vector& node_compute_funcs) override; std::shared_ptr GetKernelRegistry() const override; + // This method is called after both `GetComputeCapabilityOps()` and `Compile()`. + // This timing is required to work with both compliation-based EPs and non-compilation-based EPs. + const InlinedVector GetEpContextNodes() const override; + private: void CreateKernelRegistry(); using my_ep_t = vaip_core::DllSafe>>; using my_ep_uptr_t = std::shared_ptr; // we have to hide the implementation by forward declaration. mutable my_ep_uptr_t execution_providers_; - ProviderOptions info_; + mutable ProviderOptions info_; std::vector custom_op_domains_; std::shared_ptr registry_; std::set vitisai_optypes_; + // EP context related. + bool ep_ctx_enabled_ = false; + bool ep_ctx_embed_mode_ = true; + std::string ep_ctx_model_path_cfg_{""}; + mutable std::string backend_cache_data_{""}; + mutable PathString model_path_str_{}; + mutable PathString ep_ctx_model_file_loc_{}; + mutable std::unique_ptr p_ep_ctx_model_; + mutable std::unique_ptr p_ep_ctx_model_proto_; + // It might need to be called before loading + // the EP context model that is compiled AOT/offline. + void LoadEPContexModelFromFile() const; + void PrepareEPContextEnablement(const onnxruntime::GraphViewer&) const; + void FulfillEPContextEnablement(const std::vector&); }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc b/onnxruntime/core/providers/vitisai/vitisai_provider_factory.cc old mode 100755 new mode 100644 diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc index 3ef6490a56ded..f0eed91d70440 100644 --- a/onnxruntime/core/session/inference_session.cc +++ b/onnxruntime/core/session/inference_session.cc @@ -881,8 +881,6 @@ common::Status InferenceSession::RegisterGraphTransformer( } common::Status InferenceSession::SaveToOrtFormat(const std::filesystem::path& filepath) const { - ORT_RETURN_IF_NOT(FLATBUFFERS_LITTLEENDIAN, "ort format only supports little-endian machines"); - // Get the byte size of the ModelProto and round it to the next MB and use it as flatbuffers' init_size // TODO: Investigate whether we should set a max size, and clarify the cost of having a buffer smaller than // what the total flatbuffers serialized size will be. 
@@ -1390,8 +1388,6 @@ Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len } Status InferenceSession::LoadOrtModelWithLoader(std::function load_ort_format_model_bytes) { - static_assert(FLATBUFFERS_LITTLEENDIAN, "ORT format only supports little-endian machines"); - std::lock_guard l(session_mutex_); if (is_model_loaded_) { // already loaded diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index b53e70926cd5d..4f9669a7dcc4c 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -28,6 +28,7 @@ #include "core/session/inference_session.h" #include "core/session/abi_session_options_impl.h" #include "core/session/ort_apis.h" +#include "core/session/onnxruntime_session_options_config_keys.h" #include "core/session/provider_bridge_ort.h" #include "core/util/math.h" #include "core/framework/sparse_utils.h" @@ -68,10 +69,12 @@ using StringStringEntryProtos = google::protobuf::RepeatedPtrField; using TensorShapeProto_Dimensions = google::protobuf::RepeatedPtrField; using ValueInfoProtos = google::protobuf::RepeatedPtrField; +using FunctionProtos = google::protobuf::RepeatedPtrField; } // namespace ONNX_NAMESPACE namespace onnxruntime { using IndexedSubGraph_MetaDef = IndexedSubGraph::MetaDef; +using IndexedSubGraph_SourceOfSchema = IndexedSubGraph::SourceOfSchema; } // namespace onnxruntime #include "core/common/cpuid_info.h" @@ -132,6 +135,8 @@ ProviderInfo_Dnnl& GetProviderInfo_Dnnl(); ProviderInfo_ROCM* TryGetProviderInfo_ROCM(); ProviderInfo_ROCM& GetProviderInfo_ROCM(); ProviderHostCPU& GetProviderHostCPU(); +ProviderInfo_MIGraphX* TryGetProviderInfo_MIGraphX(); +ProviderInfo_MIGraphX& GetProviderInfo_MIGraphX(); ONNX_NAMESPACE::OpSchema CreateSchema(const std::string& domain, const std::vector& ops); struct TensorShapeProto_Dimension_Iterator_Impl : TensorShapeProto_Dimension_Iterator { TensorShapeProto_Dimension_Iterator_Impl(google::protobuf::internal::RepeatedPtrIterator&& v) : v_{std::move(v)} {} @@ -243,6 +248,11 @@ struct ProviderHostImpl : ProviderHost { void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { GetProviderInfo_CUDA().CudaCall_true(retCode, exprString, libName, successCode, msg, file, line); } #endif +#ifdef USE_MIGRAPHX + std::unique_ptr CreateMIGraphXAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_MIGraphX().CreateMIGraphXAllocator(device_id, name); } + std::unique_ptr CreateMIGraphXPinnedAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_MIGraphX().CreateMIGraphXPinnedAllocator(device_id, name); } +#endif + #ifdef USE_ROCM std::unique_ptr CreateROCMAllocator(int16_t device_id, const char* name) override { return GetProviderInfo_ROCM().CreateROCMAllocator(device_id, name); } std::unique_ptr CreateROCMPinnedAllocator(const char* name) override { return GetProviderInfo_ROCM().CreateROCMPinnedAllocator(name); } @@ -393,6 +403,11 @@ struct ProviderHostImpl : ProviderHost { int StringStringEntryProtos__size(ONNX_NAMESPACE::StringStringEntryProtos* p) override { return p->size(); } ONNX_NAMESPACE::StringStringEntryProto& StringStringEntryProtos__at(ONNX_NAMESPACE::StringStringEntryProtos* p, int index) override { return p->at(index); }; + // OperatorSetIdProto + std::string* OperatorSetIdProto__mutable_domain(ONNX_NAMESPACE::OperatorSetIdProto* p) override { return 
p->mutable_domain(); } + void OperatorSetIdProto__set_version(ONNX_NAMESPACE::OperatorSetIdProto* p, int64_t version) override { return p->set_version(version); } + int64_t OperatorSetIdProto__version(const ONNX_NAMESPACE::OperatorSetIdProto* p) override { return p->version(); } + #if !defined(DISABLE_OPTIONAL_TYPE) // TypeProto_Optional (wrapped) const ONNX_NAMESPACE::TypeProto& TypeProto_Optional__elem_type(const ONNX_NAMESPACE::TypeProto_Optional* p) override { return p->elem_type(); } @@ -521,6 +536,11 @@ struct ProviderHostImpl : ProviderHost { void ModelProto__set_ir_version(ONNX_NAMESPACE::ModelProto* p, int64_t value) override { p->set_ir_version(value); } ONNX_NAMESPACE::StringStringEntryProtos* ModelProto__mutable_metadata_props(ONNX_NAMESPACE::ModelProto* p) override { return p->mutable_metadata_props(); }; + const ONNX_NAMESPACE::OperatorSetIdProto& ModelProto__opset_import(const ONNX_NAMESPACE::ModelProto* p, int index) override { return p->opset_import(index); } + ONNX_NAMESPACE::OperatorSetIdProto* ModelProto__mutable_opset_import(ONNX_NAMESPACE::ModelProto* p, int index) override { return p->mutable_opset_import(index); } + int ModelProto__opset_import_size(const ONNX_NAMESPACE::ModelProto* p) override { return p->opset_import_size(); } + ONNX_NAMESPACE::OperatorSetIdProto* ModelProto__add_opset_import(ONNX_NAMESPACE::ModelProto* p) override { return p->add_opset_import(); } + // NodeProto (wrapped) std::unique_ptr NodeProto__construct() override { return std::make_unique(); } void NodeProto__operator_delete(ONNX_NAMESPACE::NodeProto* p) override { delete p; } @@ -528,6 +548,7 @@ struct ProviderHostImpl : ProviderHost { int NodeProto__attribute_size(ONNX_NAMESPACE::NodeProto* p) override { return p->attribute_size(); } const ONNX_NAMESPACE::AttributeProto& NodeProto__attribute(const ONNX_NAMESPACE::NodeProto* p, int index) const override { return p->attribute(index); } ONNX_NAMESPACE::AttributeProto* NodeProto__mutable_attribute(ONNX_NAMESPACE::NodeProto* p, int index) override { return p->mutable_attribute(index); } + ONNX_NAMESPACE::AttributeProto* NodeProto__add_attribute(ONNX_NAMESPACE::NodeProto* p) override { return p->add_attribute(); } // TensorProto (wrapped) std::unique_ptr TensorProto__construct() override { return std::make_unique(); } @@ -602,6 +623,64 @@ struct ProviderHostImpl : ProviderHost { const ONNX_NAMESPACE::ValueInfoProto& ValueInfoProtos__operator_array(const ONNX_NAMESPACE::ValueInfoProtos* p, int index) override { return (*p)[index]; } + // FunctionProto (wrapped) + std::unique_ptr FunctionProto__construct() override { return std::make_unique(); } + void FunctionProto__operator_delete(ONNX_NAMESPACE::FunctionProto* p) override { delete p; } + + bool FunctionProto__SerializeToString(const ONNX_NAMESPACE::FunctionProto* p, std::string& string) override { return p->SerializeToString(&string); } + bool FunctionProto__SerializeToOstream(const ONNX_NAMESPACE::FunctionProto* p, std::ostream& output) override { return p->SerializeToOstream(&output); } + bool FunctionProto__ParseFromString(ONNX_NAMESPACE::FunctionProto* p, const std::string& data) override { return p->ParseFromString(data); } + std::string FunctionProto__SerializeAsString(const ONNX_NAMESPACE::FunctionProto* p) override { return p->SerializeAsString(); } + + bool FunctionProto__has_name(const ONNX_NAMESPACE::FunctionProto* p) override { return p->has_name(); } + const std::string& FunctionProto__name(const ONNX_NAMESPACE::FunctionProto* p) const override { return p->name(); } + void 
FunctionProto__set_name(ONNX_NAMESPACE::FunctionProto* p, const std::string& name) override { p->set_name(name); } + + bool FunctionProto__has_doc_string(const ONNX_NAMESPACE::FunctionProto* p) override { return p->has_doc_string(); } + const std::string& FunctionProto__doc_string(const ONNX_NAMESPACE::FunctionProto* p) const override { return p->doc_string(); } + void FunctionProto__set_doc_string(ONNX_NAMESPACE::FunctionProto* p, const std::string& doc_string) override { p->set_doc_string(doc_string); } + + bool FunctionProto__has_domain(const ONNX_NAMESPACE::FunctionProto* p) override { return p->has_domain(); } + const std::string& FunctionProto__domain(const ONNX_NAMESPACE::FunctionProto* p) const override { return p->domain(); } + void FunctionProto__set_domain(ONNX_NAMESPACE::FunctionProto* p, const std::string& domain) override { p->set_domain(domain); } + + const std::string& FunctionProto__input(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->input(index); } + std::string* FunctionProto__mutable_input(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_input(index); } + int FunctionProto__input_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->input_size(); } + void FunctionProto__add_input(ONNX_NAMESPACE::FunctionProto* p, const std::string& value) override { p->add_input(value); } + + const std::string& FunctionProto__output(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->output(index); } + std::string* FunctionProto__mutable_output(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_output(index); } + int FunctionProto__output_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->output_size(); } + void FunctionProto__add_output(ONNX_NAMESPACE::FunctionProto* p, const std::string& value) override { p->add_output(value); } + + const std::string& FunctionProto__attribute(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->attribute(index); } + std::string* FunctionProto__mutable_attribute(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_attribute(index); } + int FunctionProto__attribute_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->attribute_size(); } + void FunctionProto__add_attribute(ONNX_NAMESPACE::FunctionProto* p, const std::string& value) override { p->add_attribute(value); } + + const ONNX_NAMESPACE::AttributeProto& FunctionProto__attribute_proto(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->attribute_proto(index); } + ONNX_NAMESPACE::AttributeProto* FunctionProto__mutable_attribute_proto(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_attribute_proto(index); } + int FunctionProto__attribute_proto_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->attribute_proto_size(); } + ONNX_NAMESPACE::AttributeProto* FunctionProto__add_attribute_proto(ONNX_NAMESPACE::FunctionProto* p) override { return p->add_attribute_proto(); } + + const ONNX_NAMESPACE::NodeProto& FunctionProto__node(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->node(index); } + ONNX_NAMESPACE::NodeProto* FunctionProto__mutable_node(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_node(index); } + int FunctionProto__node_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->node_size(); } + ONNX_NAMESPACE::NodeProto* FunctionProto__add_node(ONNX_NAMESPACE::FunctionProto* p) override { return 
p->add_node(); } + + const ONNX_NAMESPACE::ValueInfoProto& FunctionProto__value_info(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->value_info(index); } + ONNX_NAMESPACE::ValueInfoProto* FunctionProto__mutable_value_info(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_value_info(index); } + ONNX_NAMESPACE::ValueInfoProtos* FunctionProto__mutable_value_info(ONNX_NAMESPACE::FunctionProto* p) override { return p->mutable_value_info(); } + int FunctionProto__value_info_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->value_info_size(); } + ONNX_NAMESPACE::ValueInfoProto* FunctionProto__add_value_info(ONNX_NAMESPACE::FunctionProto* p) override { return p->add_value_info(); } + + const ONNX_NAMESPACE::StringStringEntryProto& FunctionProto__metadata_props(const ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->metadata_props(index); } + ONNX_NAMESPACE::StringStringEntryProto* FunctionProto__mutable_metadata_props(ONNX_NAMESPACE::FunctionProto* p, int index) override { return p->mutable_metadata_props(index); } + ONNX_NAMESPACE::StringStringEntryProtos* FunctionProto__mutable_metadata_props(ONNX_NAMESPACE::FunctionProto* p) override { return p->mutable_metadata_props(); } + int FunctionProto__metadata_props_size(const ONNX_NAMESPACE::FunctionProto* p) override { return p->metadata_props_size(); } + ONNX_NAMESPACE::StringStringEntryProto* FunctionProto__add_metadata_props(ONNX_NAMESPACE::FunctionProto* p) override { return p->add_metadata_props(); } + static int32_t convert_elem_type(const ONNX_NAMESPACE::AttributeProto* data_type) { int32_t elemType = 0; if (data_type->s() == "float32") { @@ -784,9 +863,12 @@ struct ProviderHostImpl : ProviderHost { std::vector& IndexedSubGraph__Nodes(IndexedSubGraph* p) override { return p->nodes; } - void IndexedSubGraph__SetMetaDef(IndexedSubGraph* p, std::unique_ptr&& meta_def_) override { return p->SetMetaDef(std::move(meta_def_)); } + void IndexedSubGraph__SetMetaDef(IndexedSubGraph* p, std::unique_ptr&& meta_def_) override { p->SetMetaDef(std::move(meta_def_)); } const IndexedSubGraph_MetaDef* IndexedSubGraph__GetMetaDef(const IndexedSubGraph* p) override { return p->GetMetaDef(); } + void IndexedSubGraph__SetSchemaSource(IndexedSubGraph* p, IndexedSubGraph_SourceOfSchema schema_source) override { p->schema_source = schema_source; } + IndexedSubGraph_SourceOfSchema IndexedSubGraph__GetSchemaSource(const IndexedSubGraph* p) override { return p->schema_source; } + // KernelDef (wrapped) void KernelDef__operator_delete(KernelDef* p) override { delete p; } void KernelDef__SinceVersion(const KernelDef* p, int* start, int* end) override { return p->SinceVersion(start, end); } @@ -1954,6 +2036,20 @@ ProviderInfo_ROCM& GetProviderInfo_ROCM() { ORT_THROW("ROCM Provider not available, can't get interface for it"); } +ProviderInfo_MIGraphX* TryGetProviderInfo_MIGraphX() try { + return reinterpret_cast(s_library_migraphx.Get().GetInfo()); +} catch (const std::exception& exception) { + LOGS_DEFAULT(ERROR) << exception.what(); + return nullptr; +} + +ProviderInfo_MIGraphX& GetProviderInfo_MIGraphX() { + if (auto* info = TryGetProviderInfo_MIGraphX()) + return *info; + + ORT_THROW("MIGraphX Provider not available, can't get interface for it"); +} + void CopyGpuToCpu( void* dst_ptr, const void* src_ptr, @@ -2821,6 +2917,11 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider_VitisAI, _In_ provider_options[provider_options_keys[i]] = provider_options_values[i]; } + // EP 
context related session config options. + provider_options["ep_context_enable"] = options->value.config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0"); + provider_options["ep_context_embed_mode"] = options->value.config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "1"); + provider_options["ep_context_file_path"] = options->value.config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); + auto factory = onnxruntime::VitisAIProviderFactoryCreator::Create(provider_options); if (!factory) { return OrtApis::CreateStatus(ORT_FAIL, "SessionOptionsAppendExecutionProvider_VitisAI: Failed to load shared library"); diff --git a/onnxruntime/python/onnxruntime_pybind_quant.cc b/onnxruntime/python/onnxruntime_pybind_quant.cc index 5e8e5c1a2a2fc..51a52af1b151e 100644 --- a/onnxruntime/python/onnxruntime_pybind_quant.cc +++ b/onnxruntime/python/onnxruntime_pybind_quant.cc @@ -67,7 +67,7 @@ void QuantizeMatMul4BitsBlockwise( } template -void QuantizeQDQMatMul4BitsBlockwise( +bool QuantizeQDQMatMul4BitsBlockwise( py::array_t dst, // shape: [K, N / 2] py::array_t src, // shape: [K, N] py::array_t scale, // shape: [block_per_K, N] @@ -85,7 +85,7 @@ void QuantizeQDQMatMul4BitsBlockwise( py::buffer_info scale_buf = scale.request(); py::buffer_info zp_buf = zero_points.request(); - MlasQDQQuantizeBlockwise( + return MlasQDQQuantizeBlockwise( reinterpret_cast(src_buf.ptr), reinterpret_cast(scale_buf.ptr), is_symmetric ? nullptr : reinterpret_cast(zp_buf.ptr), diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index e539614fd6d1d..e13285c60e69f 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1114,6 +1114,9 @@ std::unique_ptr CreateExecutionProviderInstance( if (it != provider_options_map.end()) { info = it->second; } + info["ep_context_enable"] = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEnable, "0"); + info["ep_context_embed_mode"] = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionEpContextEmbedMode, "1"); + info["ep_context_file_path"] = session_options.config_options.GetConfigOrDefault(kOrtSessionOptionEpContextFilePath, ""); return onnxruntime::VitisAIProviderFactoryCreator::Create(info)->CreateProvider(); #endif } else if (type == kAclExecutionProvider) { diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index 11a830dc6d7f5..40a4a4d26dc1c 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -18,31 +18,36 @@ from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto from packaging import version -from onnxruntime.capi._pybind_state import quantize_matmul_4bits +from onnxruntime.capi._pybind_state import quantize_matmul_4bits, quantize_qdq_matmul_4bits from .calibrate import CalibrationDataReader from .onnx_model import ONNXModel -from .quant_utils import attribute_to_kwarg +from .quant_utils import QuantFormat, attribute_to_kwarg logging.basicConfig(format="%(asctime)s %(name)s [%(levelname)s] - %(message)s", level=logging.INFO) logger = logging.getLogger(__name__) class WeightOnlyQuantConfig: - def __init__(self, algorithm): + def __init__(self, algorithm, quant_format): """This is the Base class for Weight Only Quant Configuration. 
Args: algorithm: weight only quantize algorithm name. + quant_format: QuantFormat{QOperator, QDQ}. + QOperator format quantizes the model with quantized operators directly. + QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. """ self.algorithm = algorithm + self.quant_format = quant_format class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig): def __init__( self, ratios=None, + quant_format=QuantFormat.QOperator, ): """ This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration. @@ -51,11 +56,18 @@ def __init__( Args: ratios: percentile of clip. Defaults to {}. + quant_format (QuantFormat{QOperator, QDQ}, optional): + QOperator format quantizes the model with quantized operators directly. + QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. + Defaults to QuantFormat.QOperator. """ + assert quant_format == QuantFormat.QOperator, "RTN only supports QOperator format" + if ratios is None: ratios = {} super().__init__( algorithm="RTN", + quant_format=quant_format, ) self.ratios = ratios @@ -69,6 +81,7 @@ def __init__( actorder=False, mse=False, perchannel=True, + quant_format=QuantFormat.QOperator, ): """ This is a class for GPTQ algorithm Weight Only Quant Configuration. @@ -87,9 +100,16 @@ def __init__( whether get scale and zero point with mse error. perchannel (bool, optional): whether quantize weight per-channel. + quant_format (QuantFormat{QOperator, QDQ}, optional): + QOperator format quantizes the model with quantized operators directly. + QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. + Defaults to QuantFormat.QOperator. """ + assert quant_format == QuantFormat.QOperator, "GPTQ only supports QOperator format" + super().__init__( algorithm="GPTQ", + quant_format=quant_format, ) self.calibration_data_reader = calibration_data_reader self.percdamp = percdamp @@ -105,6 +125,7 @@ def __init__( block_size=128, bits=4, axis=1, + quant_format=QuantFormat.QOperator, ): """ This is a class for HQQ algorithm Weight Only Quant Configuration. @@ -112,14 +133,21 @@ def __init__( Args: block_size (int, optional): - channel number in one block to execute a GPTQ quantization iteration. + channel number in one block to execute a HQQ quantization iteration. bits (int, optional): how many bits to represent weight. axis (int, optional): 0 or 1. which axis to quantize. https://arxiv.org/pdf/2309.15531.pdf + quant_format (QuantFormat{QOperator, QDQ}, optional): + QOperator format quantizes the model with quantized operators directly. + QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. + Defaults to QuantFormat.QOperator. """ + assert quant_format == QuantFormat.QOperator, "HQQ only supports QOperator format" + super().__init__( algorithm="HQQ", + quant_format=quant_format, ) self.block_size = block_size self.bits = bits @@ -132,8 +160,26 @@ def __init__( block_size: int = 128, is_symmetric: bool = False, accuracy_level: int | None = None, + quant_format=QuantFormat.QOperator, ): - super().__init__(algorithm="DEFAULT") + """ + This is a class for weight only affine quantization configuration. + + Args: + block_size (int, optional): + channel number in one block to execute an affine quantization iteration. + is_symmetric (bool, optional): + whether quantize weight symmetrically. + accuracy_level (int, optional): + Accuracy level of the 4-bit quantized MatMul computation. 
+ Refer to the MatMulNBits contrib op's 'accuracy_level' attribute for details. + (https://github.com/microsoft/onnxruntime/blob/main/docs/ContribOperators.md#commicrosoftmatmulnbits) + quant_format (QuantFormat{QOperator, QDQ}, optional): + QOperator format quantizes the model with quantized operators directly. + QDQ format quantizes the model by inserting QuantizeLinear/DeQuantizeLinear on the tensor. + Defaults to QuantFormat.QOperator. + """ + super().__init__(algorithm="DEFAULT", quant_format=quant_format) self.block_size = block_size self.is_symmetric = is_symmetric self.bits = 4 @@ -287,23 +333,26 @@ def quantize_internal( return w_q, scale.to(tensor.dtype), zero.to(tensor.dtype) - def quantize(self, node: NodeProto, graph_stack: list[GraphProto]): - """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node""" + def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]: + """ + If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node. + If QOperator format, return MatMulNBits. If QDQ format, return DeQuantizeLinear + MatMul. + """ if node.op_type != "MatMul": - return node # only care about MatMul for now + return [node] # only care about MatMul for now import torch logger.info(f"start to quantize {node.name} ...") - inputB = node.input[1] # noqa: N806 - b_pb, bs_graph = get_initializer(inputB, graph_stack) + input_b = node.input[1] + b_pb, bs_graph = get_initializer(input_b, graph_stack) if b_pb is None: logger.info("MatMul doesn't have const weight. Skip to quantize") - return node # only care about constant weight + return [node] # only care about constant weight b_array = onnx.numpy_helper.to_array(b_pb) if len(b_array.shape) != 2: logger.info("MatMul weight is not 2D. 
Skip to quantize") - return node # can only process 2-D matrix + return [node] # can only process 2-D matrix b_array_torch = torch.from_numpy(b_array) if torch.cuda.is_available(): b_array_torch = b_array_torch.cuda() @@ -334,7 +383,7 @@ def quantize(self, node: NodeProto, graph_stack: list[GraphProto]): b_quant = onnx.numpy_helper.from_array(packed_torch.cpu().numpy()) b_quant.name = b_pb.name + "_Q4" for input in bs_graph.input: - if input.name == inputB: + if input.name == input_b: bs_graph.input.remove(input) break @@ -366,7 +415,7 @@ def quantize(self, node: NodeProto, graph_stack: list[GraphProto]): logger.info(f"complete quantization of {node.name} ...") - return matmul_q4_node + return [matmul_q4_node] def get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]: @@ -382,7 +431,7 @@ class DefaultWeightOnlyQuantizer: def __init__(self, config: DefaultWeightOnlyQuantConfig): self.config = config - def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: + def int4_block_quant(self, fp32weight: npt.ArrayLike) -> tuple[np.ndarray, np.ndarray, np.ndarray]: """4b quantize fp32 weight to a blob""" if len(fp32weight.shape) != 2: @@ -390,83 +439,136 @@ def int4_block_quant(self, fp32weight: npt.ArrayLike) -> np.ndarray: rows, cols = fp32weight.shape block_size = self.config.block_size - blob_size = block_size // 2 k_blocks = (rows + block_size - 1) // block_size - padded_rows = k_blocks * block_size - pad_len = padded_rows - rows - if pad_len > 0: - fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant") - # block wise quantization, each block comes from a single column - packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8") - scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype) - zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8") - quantize_matmul_4bits(packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric) + if self.config.quant_format == QuantFormat.QOperator: + blob_size = block_size // 2 + padded_rows = k_blocks * block_size + pad_len = padded_rows - rows + if pad_len > 0: + fp32weight = np.pad(fp32weight, ((0, pad_len), (0, 0)), "constant") + + # block wise quantization, each block comes from a single column + packed = np.zeros((cols, k_blocks, blob_size), dtype="uint8") + zero_point = np.zeros(cols * ((k_blocks + 1) // 2), dtype="uint8") + scales = np.zeros((cols * k_blocks), dtype=fp32weight.dtype) + quantize_matmul_4bits( + packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric + ) + else: + packed = np.zeros((rows * cols + 1) // 2, dtype="uint8") + zero_point = np.zeros((cols * k_blocks + 1) // 2, dtype="uint8") + scales = np.zeros((k_blocks, cols), dtype=fp32weight.dtype) + quantize_qdq_matmul_4bits( + packed, fp32weight, scales, zero_point, block_size, cols, rows, self.config.is_symmetric + ) return (packed, scales, zero_point) - def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> NodeProto: - """If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node""" + def quantize(self, node: NodeProto, graph_stack: list[GraphProto]) -> list[NodeProto]: + """ + If the node is MatMul with fp32 const weight, quantize the weight with int4, and return the new node. + If QOperator format, return MatMulNbits. If QDQ format, return DeQuantizeLinear + MatMul. 
+ """ if node.op_type != "MatMul": - return node # only care about MatMul for now + return [node] # only care about MatMul for now logger.info(f"start to quantize {node.name} ...") - inputB = node.input[1] # noqa: N806 - B, Bs_graph = get_initializer(inputB, graph_stack) # noqa: N806 - if B is None: + qtype = TensorProto.INT4 if self.config.is_symmetric else TensorProto.UINT4 + input_b = node.input[1] + b_tensor, b_graph = get_initializer(input_b, graph_stack) + if b_tensor is None: logger.info("MatMul doesn't have const weight. Skip to quantize") - return node # only care about constant weight + return [node] # only care about constant weight - B_array = onnx.numpy_helper.to_array(B) # noqa: N806 - if len(B_array.shape) != 2: + b_ndarray = onnx.numpy_helper.to_array(b_tensor) + if len(b_ndarray.shape) != 2: logger.info("MatMul weight is not 2D. Skip to quantize") - return node # can only process 2-D matrix - - packed, scales, zero_points = self.int4_block_quant(B_array) - B_quant = onnx.numpy_helper.from_array(packed) # noqa: N806 - B_quant.name = B.name + "_Q4" - for input in Bs_graph.input: - if input.name == inputB: - Bs_graph.input.remove(input) - break + return [node] # can only process 2-D matrix - scales_tensor = onnx.numpy_helper.from_array(scales) - scales_tensor.name = B.name + "_scales" - Bs_graph.initializer.extend([B_quant, scales_tensor]) + packed, scales, zero_points = self.int4_block_quant(b_ndarray) - input_names = [node.input[0], B_quant.name, scales_tensor.name] - if not self.config.is_symmetric: - zp_tensor = onnx.numpy_helper.from_array(zero_points) - zp_tensor.name = B.name + "_zero_points" - Bs_graph.initializer.extend([zp_tensor]) - input_names.append(zp_tensor.name) + if self.config.quant_format == QuantFormat.QOperator: + b_quant = onnx.numpy_helper.from_array(packed, b_tensor.name + "_Q4") + scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_scales") + else: + b_quant = onnx.helper.make_tensor(b_tensor.name + "_DQ_Q4", qtype, b_ndarray.shape, packed.tobytes(), True) + scales_tensor = onnx.numpy_helper.from_array(scales, b_tensor.name + "_DQ_scales") - kwargs = {} - rows, cols = B_array.shape - kwargs["K"] = rows - kwargs["N"] = cols - kwargs["bits"] = 4 - kwargs["block_size"] = self.config.block_size - if self.config.accuracy_level is not None: - kwargs["accuracy_level"] = self.config.accuracy_level + for input in b_graph.input: + if input.name == input_b: + b_graph.input.remove(input) + break - matmul_q4_node = onnx.helper.make_node( - "MatMulNBits", - inputs=input_names, - outputs=[node.output[0]], - name=node.name + "_Q4" if node.name else "", - domain="com.microsoft", - **kwargs, - ) + b_graph.initializer.extend([b_quant, scales_tensor]) + + output_nodes = [] + + if self.config.quant_format == QuantFormat.QOperator: + input_names = [node.input[0], b_quant.name, scales_tensor.name] + if not self.config.is_symmetric: + zp_tensor = onnx.numpy_helper.from_array(zero_points, b_tensor.name + "_zero_points") + input_names.append(zp_tensor.name) + b_graph.initializer.extend([zp_tensor]) + kwargs = {} + rows, cols = b_ndarray.shape + kwargs["K"] = rows + kwargs["N"] = cols + kwargs["bits"] = 4 + kwargs["block_size"] = self.config.block_size + if self.config.accuracy_level is not None: + kwargs["accuracy_level"] = self.config.accuracy_level + + matmul_q4_node = onnx.helper.make_node( + "MatMulNBits", + inputs=input_names, + outputs=[node.output[0]], + name=node.name + "_Q4" if node.name else "", + domain="com.microsoft", + **kwargs, + ) - 
logger.info(f"complete quantization of {node.name} ...") + output_nodes.append(matmul_q4_node) + else: + dq_input_names = [b_quant.name, scales_tensor.name] + dq_output_names = [b_quant.name + "_output"] + matmul_input_names = [node.input[0], dq_output_names[0]] + matmul_output_names = [node.output[0]] + if not self.config.is_symmetric: + zp_tensor = onnx.helper.make_tensor( + b_tensor.name + "_DQ_zero_points", qtype, scales.shape, zero_points.tobytes(), True + ) + dq_input_names.append(zp_tensor.name) + b_graph.initializer.extend([zp_tensor]) + dq_kwargs = {"axis": 0, "block_size": self.config.block_size} + dq_node = onnx.helper.make_node( + "DequantizeLinear", + inputs=dq_input_names, + outputs=dq_output_names, + name=node.name + "_DQ_Q4" if node.name else "", + **dq_kwargs, + ) + matmul_node = onnx.helper.make_node( + "MatMul", + inputs=matmul_input_names, + outputs=matmul_output_names, + name=node.name + "_matmul_Q4" if node.name else "", + ) + output_nodes.extend([dq_node, matmul_node]) - return matmul_q4_node + logger.info(f"complete quantization of {node.name} ...") + return output_nodes class MatMul4BitsQuantizer: - """Perform 4b quantization of constant MatMul weights""" + """ + Perform 4b quantization of constant MatMul weights. + If algo_config.quant_format is QOperator, the quantized weight is stored in a MatMulNBits node, which relaces the + MatMul node. + If algo_config.quant_format is QDQ, the quantized weight is stored in a DeQuantizeLinear node. The MatMul node is + replaced by the DequantizeLinear + MatMul nodes. + """ def __init__( self, @@ -475,7 +577,8 @@ def __init__( is_symmetric: bool = False, accuracy_level: int | None = None, nodes_to_exclude=None, - algo_config: WeightOnlyQuantConfig = None, + quant_format=QuantFormat.QOperator, + algo_config: WeightOnlyQuantConfig | None = None, ): if nodes_to_exclude is None: nodes_to_exclude = [] @@ -488,7 +591,10 @@ def __init__( self.node_quantizer = None if algo_config is None: algo_config = DefaultWeightOnlyQuantConfig( - block_size=block_size, is_symmetric=is_symmetric, accuracy_level=accuracy_level + block_size=block_size, + is_symmetric=is_symmetric, + accuracy_level=accuracy_level, + quant_format=quant_format, ) self.algo_config = algo_config if algo_config.algorithm == "HQQ": @@ -526,15 +632,15 @@ def _process_subgraph(self, graph_stack: list[GraphProto]): node = onnx.helper.make_node( # noqa: PLW2901 node.op_type, node.input, node.output, name=node.name, **kwargs ) - out_node = None + out_nodes = [] if node.name in self.nodes_to_exclude: logger.info(f"exclude to quantize {node.name} as specified by nodes_to_exclude...") - out_node = node + out_nodes = [node] elif self.algo_config is not None and self.algo_config.algorithm == "HQQ": - out_node = self.node_quantizer.quantize(node, graph_stack) + out_nodes = self.node_quantizer.quantize(node, graph_stack) else: - out_node = self.node_quantizer.quantize(node, graph_stack) - new_nodes.append(out_node) + out_nodes = self.node_quantizer.quantize(node, graph_stack) + new_nodes.extend(out_nodes) graph.ClearField("node") graph.node.extend(new_nodes) @@ -688,6 +794,15 @@ def parse_args(): default=[], help="Specify the nodes to be excluded from quantization with node names", ) + parser.add_argument( + "--quant_format", + default="QOperator", + type=QuantFormat, + choices=list(QuantFormat), + help="QuantFormat {QOperator, QDQ}" + "QOperator format quantizes the model with quantized operators directly." 
+ "QDQ format quantize the model by inserting DeQuantizeLinear before the MatMul.", + ) return parser.parse_args() @@ -699,6 +814,7 @@ def parse_args(): input_model_path = args.input_model output_model_path = args.output_model + quant_format = args.quant_format if os.path.exists(output_model_path): logger.error(f"file {output_model_path} already exists") @@ -713,7 +829,10 @@ def parse_args(): quant_config = HQQWeightOnlyQuantConfig(block_size=args.block_size, bits=args.bits) elif args.quant_method == "default": quant_config = DefaultWeightOnlyQuantConfig( - block_size=args.block_size, is_symmetric=args.symmetric, accuracy_level=args.accuracy_level + block_size=args.block_size, + is_symmetric=args.symmetric, + accuracy_level=args.accuracy_level, + quant_format=quant_format, ) elif args.quant_method == "rtn": quant_config = RTNWeightOnlyQuantConfig() diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 9ee8f27df5c99..2f335009b59c6 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -15,12 +15,10 @@ from typing import List, Optional TRT_DOCKER_FILES = { - "8.4.cuda_11_6_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4", - "8.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5", "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", "8.6.cuda_12_3_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6", - "10.0.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0", - "10.0.cuda_12_4_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0", + "10.2.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", + "10.2.cuda_12_5_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin", } diff --git a/onnxruntime/python/tools/transformers/models/llama/README.md b/onnxruntime/python/tools/transformers/models/llama/README.md index 6fba98c14e792..cd8a8756d681e 100644 --- a/onnxruntime/python/tools/transformers/models/llama/README.md +++ b/onnxruntime/python/tools/transformers/models/llama/README.md @@ -27,8 +27,6 @@ Please note the package versions needed for using LLaMA-2 in the `requirements.t - Note that `torch` with CUDA enabled is not installed automatically. This is because `torch` should be installed with the CUDA version used on your machine. Please visit [the PyTorch website](https://pytorch.org/get-started/locally/) to download the `torch` version that is used with the CUDA version installed on your machine and satisfies the requirement listed in the file. 
- `requirements-quant.txt` - For running the SmoothQuant algorithm using [Intel's Neural Compressor](https://github.com/intel/neural-compressor) -- `requirements-70b-model.txt` - - For running the LLaMA-2 70B model on multiple GPUs - `requirements.txt` - Package versions needed in each of the above files @@ -221,18 +219,6 @@ $ python3 -m models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output l $ python3 -m onnxruntime.transformers.models.llama.convert_to_onnx -m meta-llama/Llama-2-7b-hf --output llama2-7b-int4-cpu --precision int4 --quantization_method blockwise --execution_provider cpu --use_gqa ``` -Export LLaMA-2 70B sharded model into 4 partitions -``` -# From source: -# 1. Install necessary packages from requirements-70b-model.txt -$ pip install -r requirements-70b-model.txt - -# 2. Build ONNX Runtime from source with NCCL enabled. Here is a sample command: -$ ./build.sh --config Release --use_cuda --cuda_home /usr/local/cuda-12.2 --cudnn_home /usr/local/cuda-12.2 --build_wheel --cuda_version=12.2 --parallel --skip_tests --enable_nccl --nccl_home /usr/local/cuda-12.2 --use_mpi --mpi_home=/usr/lib/x86_64-linux-gnu/ - -# 3. Shard and export the LLaMA-2 70B model. With FP16, you will need at least 140GB of GPU memory to load the model. Therefore, you will need at least 4 40GB A100 GPUs or 2 80GB A100 GPUs to shard the PyTorch model and export each shard to ONNX. Here is an example command: -$ CUDA_VISIBLE_DEVICES=0,1,2,3 bash convert_70b_model.sh 4 -m meta-llama/Llama-2-70b-hf --output llama2-70b-distributed --precision fp16 --execution_provider cuda --use_gqa -``` ## Parity Checking LLaMA-2 @@ -395,18 +381,6 @@ CUDA_VISIBLE_DEVICES=4 python3 -m models.llama.benchmark \ --device cuda ``` -9. ONNX Runtime, FP16, convert_to_onnx, LLaMA-2 70B shard to 4 GPUs -``` -CUDA_VISIBLE_DEVICES=4,5,6,7 bash benchmark_70b_model.sh 4 \ - --benchmark-type ort-convert-to-onnx \ - --ort-model-path ./llama2-70b-dis/rank_{}_Llama-2-70b-hf_decoder_merged_model_fp16.onnx \ - --model-name meta-llama/Llama-2-70b-hf \ - --cache-dir ./model_cache \ - --precision fp16 \ - --device cuda \ - --warmup-runs 5 \ - --num-runs 100 -``` You can profile a variant by adding the `--profile` flag and providing one batch size and sequence length combination. 
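As a companion to the `quant_format` option wired through `matmul_4bits_quantizer.py` above, here is a minimal sketch of driving the 4-bit weight-only quantizer from Python with the new QDQ format. The model paths and block size are placeholders, not part of this change; the API calls mirror those exercised by the updated unit tests.

```python
from pathlib import Path

from onnxruntime.quantization import matmul_4bits_quantizer, quant_utils

# Placeholder paths; point these at a real fp32 ONNX model (e.g. an exported LLaMA-2 decoder).
# Note: the QDQ path emits block-quantized DequantizeLinear, which needs opset 21+ in the model.
model_fp32_path = "llama2-7b-fp32/model.onnx"
model_int4_path = "llama2-7b-int4-qdq/model.onnx"

model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path))

# QDQ stores the 4-bit weight behind a DequantizeLinear feeding a plain MatMul,
# instead of fusing everything into a single MatMulNBits node (QOperator).
quant_config = matmul_4bits_quantizer.DefaultWeightOnlyQuantConfig(
    block_size=32,
    is_symmetric=True,
    quant_format=quant_utils.QuantFormat.QDQ,
)

quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, algo_config=quant_config)
quant.process()
quant.model.save_model_to_file(model_int4_path, True)  # second arg: save weights as external data
```

The resulting graph should contain DequantizeLinear + MatMul pairs rather than MatMulNBits nodes, which is what the new `test_quantize_matmul_int4_*_qdq` tests below verify.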
diff --git a/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh b/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh deleted file mode 100644 index 38f1916456658..0000000000000 --- a/onnxruntime/python/tools/transformers/models/llama/benchmark_70b_model.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -NUM_GPUS=${1:-1} - -MPI="mpirun --allow-run-as-root - -mca btl_openib_warn_no_device_params_found 0 -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 - --tag-output --npernode $NUM_GPUS --bind-to numa - -x MIOPEN_FIND_MODE=1" - -CMD="$MPI python benchmark.py ${@:2}" - -$CMD \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh b/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh deleted file mode 100644 index 637d15c10e0c7..0000000000000 --- a/onnxruntime/python/tools/transformers/models/llama/convert_70b_model.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash - -NUM_GPUS=${1:-1} - -MPI="mpirun --allow-run-as-root - -mca btl_openib_warn_no_device_params_found 0 -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 - --tag-output --npernode $NUM_GPUS --bind-to numa - -x MIOPEN_FIND_MODE=1" - -CMD="$MPI python convert_to_onnx.py ${@:2}" - -$CMD \ No newline at end of file diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt b/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt deleted file mode 100644 index 572cfdb71be4a..0000000000000 --- a/onnxruntime/python/tools/transformers/models/llama/requirements-70b-model.txt +++ /dev/null @@ -1,4 +0,0 @@ --r requirements.txt -git+https://github.com/frankdongms/transformers.git@frdong/shard_llama -mpi4py -psutil \ No newline at end of file diff --git a/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc b/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc index 32f2da806be3b..467c5e773589a 100644 --- a/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc +++ b/onnxruntime/test/flatbuffers/flatbuffer_utils_test.cc @@ -12,7 +12,6 @@ #include "core/graph/graph_flatbuffers_utils.h" #include "core/framework/tensorprotoutils.h" #include "core/providers/cpu/cpu_execution_provider.h" - #include "test/flatbuffers/flatbuffers_utils_test.fbs.h" #include "test/util/include/asserts.h" @@ -116,6 +115,10 @@ ONNX_NAMESPACE::TensorProto CreateInitializer(const std::string& name, ORT_THROW("Unsupported data type: ", data_type); } + if constexpr (endian::native != endian::little) { + utils::ConvertRawDataInTensorProto(&tp); + } + return tp; } @@ -258,6 +261,9 @@ TEST(FlatbufferUtilsTest, ExternalWriteReadWithLoadInitializers) { for (const auto* fbs_tensor : *fbs_tensors2) { ONNX_NAMESPACE::TensorProto initializer; ASSERT_STATUS_OK(LoadInitializerOrtFormat(*fbs_tensor, initializer, options, reader)); + if constexpr (endian::native != endian::little) { + utils::ConvertRawDataInTensorProto(&initializer); + } loaded_initializers.emplace_back(std::move(initializer)); // also check that the loaded flatbuffer tensors have accurately written to the external_data_offset field if (fbs_tensor->data_type() != fbs::TensorDataType::STRING && fbs_tensor->name()->str() != "tensor_32_small") { diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc index fa42bb6e96cd5..7bd6b47f52b7d 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -705,6 +705,9 @@ struct 
InsertIndices { // Conversion on the fly to the target data type std::vector indices(indices_data.cbegin(), indices_data.cend()); indices_tp.mutable_raw_data()->assign(reinterpret_cast(indices.data()), indices.size() * sizeof(T)); + if constexpr (endian::native != endian::little) { + utils::ConvertRawDataInTensorProto((ONNX_NAMESPACE::TensorProto*)&indices_tp); + } } } }; @@ -837,7 +840,7 @@ static void TestConversion( template static void RawDataWriter(const std::vector& values, TensorProto& tp, TensorProto_DataType datatype) { tp.set_data_type(datatype); - tp.set_raw_data(values.data(), values.size() * sizeof(T)); + utils::SetRawDataInTensorProto(tp, values.data(), values.size() * sizeof(T)); } int64_t ActualSize(const TensorProto& actual) { diff --git a/onnxruntime/test/framework/tensorutils_test.cc b/onnxruntime/test/framework/tensorutils_test.cc index 05bdb3a9a033d..6821f582ce2de 100644 --- a/onnxruntime/test/framework/tensorutils_test.cc +++ b/onnxruntime/test/framework/tensorutils_test.cc @@ -30,7 +30,7 @@ void TestUnpackFloatTensor(TensorProto_DataType type, const std::filesystem::pat for (int i = 0; i < 4; ++i) { memcpy(rawdata + i * sizeof(T), &(f[i]), sizeof(T)); } - float_tensor_proto.set_raw_data(rawdata, len); + utils::SetRawDataInTensorProto(float_tensor_proto, rawdata, len); T float_data2[4]; auto status = UnpackTensor(float_tensor_proto, model_path, float_data2, 4); EXPECT_TRUE(status.IsOK()) << status.ErrorMessage(); @@ -102,8 +102,25 @@ std::vector CreateValues() { return {BFloat16(0.f), BFloat16(1.f), BFloat16(2.f), BFloat16(3.f)}; } +template +void ConvertEndianessForVector(const std::vector& test_data) { + const size_t element_size = sizeof(T); + const size_t num_elements = test_data.size(); + char* bytes = reinterpret_cast(const_cast(test_data.data())); + for (size_t i = 0; i < num_elements; ++i) { + char* start_byte = bytes + i * element_size; + char* end_byte = start_byte + element_size - 1; + for (size_t count = 0; count < element_size / 2; ++count) { + std::swap(*start_byte++, *end_byte--); + } + } +} + template void WriteDataToFile(FILE* fp, const std::vector& test_data) { + if constexpr (endian::native != endian::little) { + ConvertEndianessForVector(test_data); + } size_t size_in_bytes = test_data.size() * sizeof(T); ASSERT_EQ(size_in_bytes, fwrite(test_data.data(), 1, size_in_bytes, fp)); } @@ -147,6 +164,9 @@ void UnpackAndValidate(const TensorProto& tensor_proto, const std::filesystem::p std::vector val(test_data.size()); auto st = utils::UnpackTensor(tensor_proto, model_path, val.data(), test_data.size()); ASSERT_TRUE(st.IsOK()) << st.ErrorMessage(); + if constexpr (endian::native != endian::little) { + ConvertEndianessForVector(val); + } // Validate data for (size_t i = 0; i < test_data.size(); i++) { @@ -325,6 +345,9 @@ static void TestConstantNodeConversionWithExternalData(TensorProto_DataType type std::vector val(test_data.size()); auto st = utils::UnpackTensor(tp, model_path, val.data(), test_data.size()); ASSERT_TRUE(st.IsOK()) << st.ErrorMessage(); + if constexpr (endian::native != endian::little) { + ConvertEndianessForVector(val); + } for (size_t i = 0; i < test_data.size(); i++) { ASSERT_EQ(val[i], test_data[i]); } diff --git a/onnxruntime/test/framework/test_tensor_loader.cc b/onnxruntime/test/framework/test_tensor_loader.cc index 17edad73085c9..73bf351b6c556 100644 --- a/onnxruntime/test/framework/test_tensor_loader.cc +++ b/onnxruntime/test/framework/test_tensor_loader.cc @@ -104,6 +104,18 @@ static void run_external_data_test() { 
std::unique_ptr file_deleter(const_cast(filename.c_str()), DeleteFileFromDisk); float test_data[] = {1.0f, 2.2f, 3.5f}; + if constexpr (endian::native != endian::little) { + const int element_size = sizeof(float); + char* bytes = reinterpret_cast(test_data); + const size_t num_elements = std::size(test_data); + for (size_t i = 0; i < num_elements; ++i) { + char* start_byte = bytes + i * element_size; + char* end_byte = start_byte + element_size - 1; + for (size_t count = 0; count < element_size / 2; ++count) { + std::swap(*start_byte++, *end_byte--); + } + } + } ASSERT_EQ(sizeof(test_data), fwrite(test_data, 1, sizeof(test_data), fp)); ASSERT_EQ(0, fclose(fp)); // construct a tensor proto @@ -128,8 +140,12 @@ static void run_external_data_test() { len = GetCurrentDirectoryW(len, (ORTCHAR_T*)cwd.data()); ASSERT_NE(len, (DWORD)0); cwd.append(ORT_TSTR("\\fake.onnx")); +#else +#if defined(_AIX) + char* p = getcwd(nullptr, PATH_MAX); #else char* p = getcwd(nullptr, 0); +#endif ASSERT_NE(p, nullptr); cwd = p; free(p); diff --git a/onnxruntime/test/mlas/bench/bench_q4dq.cpp b/onnxruntime/test/mlas/bench/bench_q4dq.cpp index 00234ecfd2ce2..9d15c9a6bf994 100644 --- a/onnxruntime/test/mlas/bench/bench_q4dq.cpp +++ b/onnxruntime/test/mlas/bench/bench_q4dq.cpp @@ -69,6 +69,7 @@ static void BM_QDQBlockwiseQuantizer_TransposeColumnwise(benchmark::State& state int N = state.range(1); int quant_block_size = state.range(2); int threads = state.range(3); + bool add8 = state.range(4) != 0; int quant_num_M = (M + quant_block_size - 1) / quant_block_size; int blob_size = (quant_block_size + 1) / 2; size_t scale_size = quant_num_M * N; @@ -87,12 +88,22 @@ static void BM_QDQBlockwiseQuantizer_TransposeColumnwise(benchmark::State& state onnxruntime::concurrency::CreateThreadPool(&onnxruntime::Env::Default(), tpo, onnxruntime::concurrency::ThreadPoolType::INTRA_OP)); - for (auto _ : state) { - benchmark::DoNotOptimize(dst.data()); - MlasQDQTransposeBlockwiseQuantized( - dst.data(), scales.data(), zero_points.data(), dst_T.data(), scales_T.data(), zero_points_T.data(), - true, M, N, quant_block_size, tp.get()); - benchmark::ClobberMemory(); + if (add8) { + for (auto _ : state) { + benchmark::DoNotOptimize(dst.data()); + MlasQDQTransposeBlockwiseQuantized( + dst.data(), scales.data(), zero_points.data(), dst_T.data(), scales_T.data(), zero_points_T.data(), + true, M, N, quant_block_size, tp.get()); + benchmark::ClobberMemory(); + } + } else { + for (auto _ : state) { + benchmark::DoNotOptimize(dst.data()); + MlasQDQTransposeBlockwiseQuantized( + dst.data(), scales.data(), zero_points.data(), dst_T.data(), scales_T.data(), zero_points_T.data(), + true, M, N, quant_block_size, tp.get()); + benchmark::ClobberMemory(); + } } } @@ -113,6 +124,6 @@ BENCHMARK(BM_MlasQuantizeBlockwise) BENCHMARK(BM_QDQBlockwiseQuantizer_TransposeColumnwise) ->UseRealTime() ->Apply([](benchmark::internal::Benchmark* b) { - b->ArgNames({"M", "N", "quant_block_size", "threads"}); - b->ArgsProduct({{1024, 4096}, {4096, 4095}, {64, 128}, {2, 8, 16}}); + b->ArgNames({"M", "N", "quant_block_size", "threads", "add8"}); + b->ArgsProduct({{1024, 4096}, {4096, 4095}, {64, 128}, {2, 8, 16}, {0, 1}}); }); diff --git a/onnxruntime/test/mlas/unittest/test_blockq4.cpp b/onnxruntime/test/mlas/unittest/test_blockq4.cpp index b466e883059f4..f75002f715154 100644 --- a/onnxruntime/test/mlas/unittest/test_blockq4.cpp +++ b/onnxruntime/test/mlas/unittest/test_blockq4.cpp @@ -127,13 +127,22 @@ class MlasBlockwiseQdqTest : public MlasTestBase { columnwise, rows, 
columns, columns, threadpool_ptr); if (columnwise) { - MlasQDQQuantizeBlockwise( + bool signed_quant = MlasQDQQuantizeBlockwise( transposed, qdq_scales, qdq_zp, qdq_weights, true, rows, columns, block_size, threadpool_ptr); - MlasQDQTransposeBlockwiseQuantized( - qdq_weights, qdq_scales, qdq_zp, qdq_weights_T, qdq_scales_T, qdq_zp_T, - true, rows, columns, block_size, threadpool_ptr); + ASSERT_EQ(symmetric, signed_quant) << "symmetric quantization should be signed"; + + if (symmetric) { + MlasQDQTransposeBlockwiseQuantized( + qdq_weights, qdq_scales, qdq_zp, qdq_weights_T, qdq_scales_T, qdq_zp_T, + true, rows, columns, block_size, threadpool_ptr); + + } else { + MlasQDQTransposeBlockwiseQuantized( + qdq_weights, qdq_scales, qdq_zp, qdq_weights_T, qdq_scales_T, qdq_zp_T, + true, rows, columns, block_size, threadpool_ptr); + } } for (int c = 0; c < columns; c++) { diff --git a/onnxruntime/test/onnx/TestCase.h b/onnxruntime/test/onnx/TestCase.h index 0cb92056d378e..745a1fe9eeb50 100644 --- a/onnxruntime/test/onnx/TestCase.h +++ b/onnxruntime/test/onnx/TestCase.h @@ -53,7 +53,8 @@ class TestModelInfo { public: virtual const std::filesystem::path& GetModelUrl() const = 0; virtual std::filesystem::path GetDir() const { - return GetModelUrl().parent_path(); + const auto& p = GetModelUrl(); + return p.has_parent_path() ? p.parent_path() : std::filesystem::current_path(); } virtual const std::string& GetNodeName() const = 0; virtual const ONNX_NAMESPACE::ValueInfoProto* GetInputInfoFromModel(size_t i) const = 0; diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index fc29756a1ff98..9886d98dcc6d6 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -8,6 +8,8 @@ #include #ifdef _WIN32 #include "getopt.h" +#elif defined(_AIX) +#include #else #include #include diff --git a/onnxruntime/test/onnx/tensorprotoutils.cc b/onnxruntime/test/onnx/tensorprotoutils.cc index 5df055f862a86..50ab2290c6456 100644 --- a/onnxruntime/test/onnx/tensorprotoutils.cc +++ b/onnxruntime/test/onnx/tensorprotoutils.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include "mem_buffer.h" #include "core/common/safeint.h" @@ -68,11 +69,22 @@ static void UnpackTensorWithRawData(const void* raw_data, size_t raw_data_length ORT_CXX_API_THROW(MakeString("UnpackTensor: the pre-allocated size does not match the raw data size, expected ", expected_size_in_bytes, ", got ", raw_data_length), OrtErrorCode::ORT_FAIL); + memcpy(p_data, raw_data, raw_data_length); if constexpr (endian::native != endian::little) { - ORT_CXX_API_THROW("UnpackTensorWithRawData only handles little-endian native byte order for now.", - OrtErrorCode::ORT_NOT_IMPLEMENTED); + /* Convert Endianness */ + char* bytes = reinterpret_cast(p_data); + size_t element_size = sizeof(T); + size_t num_elements = raw_data_length / element_size; + + for (size_t i = 0; i < num_elements; ++i) { + char* start_byte = bytes + i * element_size; + char* end_byte = start_byte + element_size - 1; + /* keep swapping */ + for (size_t count = 0; count < element_size / 2; ++count) { + std::swap(*start_byte++, *end_byte--); + } + } } - memcpy(p_data, raw_data, raw_data_length); } template <> diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 2bfa57a2ceb9e..3e4e845440117 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -4972,8 +4972,8 @@ TEST_F(GraphTransformationTests, CseWithConstantOfShape) 
{ TensorProto value_tensor; value_tensor.add_dims(1); float value = 2.333f; - value_tensor.set_raw_data(reinterpret_cast(&value), sizeof(float)); value_tensor.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_FLOAT); + utils::SetRawDataInTensorProto(value_tensor, reinterpret_cast(&value), sizeof(float)); builder.AddNode("ConstantOfShape", {shape_out_1}, {constant_of_shape_out_1}).AddAttribute("value", value_tensor); builder.AddNode("ConstantOfShape", {shape_out_2}, {constant_of_shape_out_2}).AddAttribute("value", value_tensor); builder.AddNode("Mul", {input_arg, constant_of_shape_out_1}, {mul_out_1}); diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.cc b/onnxruntime/test/optimizer/graph_transform_test_builder.cc index 73c8b3f119103..2cbfbbb317642 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.cc +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.cc @@ -61,7 +61,7 @@ NodeArg* ModelTestBuilder::MakeInitializer(gsl::span shape, ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_name(name); tensor_proto.set_data_type(elem_type); - tensor_proto.set_raw_data(raw_data.data(), raw_data.size()); + utils::SetRawDataInTensorProto(tensor_proto, raw_data.data(), raw_data.size()); for (auto& dim : shape) { tensor_proto.add_dims(dim); diff --git a/onnxruntime/test/optimizer/graph_transform_test_builder.h b/onnxruntime/test/optimizer/graph_transform_test_builder.h index 0282d09f340b2..6214094a26c4f 100644 --- a/onnxruntime/test/optimizer/graph_transform_test_builder.h +++ b/onnxruntime/test/optimizer/graph_transform_test_builder.h @@ -13,6 +13,7 @@ #include "core/framework/int4.h" #include "core/optimizer/graph_transformer_level.h" #include "core/graph/onnx_protobuf.h" +#include "core/framework/tensorprotoutils.h" #include "test/framework/test_utils.h" #include "test/common/tensor_op_test_utils.h" #include "test/framework/test_utils.h" @@ -249,7 +250,7 @@ class ModelTestBuilder { tensor_proto.set_data_type(utils::ToTensorProtoElementType()); std::unique_ptr data_buffer = std::make_unique(data.size()); for (size_t i = 0; i < data.size(); ++i) data_buffer[i] = data[i]; - tensor_proto.set_raw_data(data_buffer.get(), data.size()); + utils::SetRawDataInTensorProto(tensor_proto, data_buffer.get(), data.size()); for (auto& dim : shape) { tensor_proto.add_dims(dim); diff --git a/onnxruntime/test/optimizer/initializer_test.cc b/onnxruntime/test/optimizer/initializer_test.cc index 522e96e762d5a..391942acfca35 100644 --- a/onnxruntime/test/optimizer/initializer_test.cc +++ b/onnxruntime/test/optimizer/initializer_test.cc @@ -163,8 +163,8 @@ void TestInitializerRawData() { tensor_proto.set_name("OptimizerInitializerTest_RawData"); tensor_proto.add_dims(3); tensor_proto.add_dims(4); - tensor_proto.set_raw_data(data.data(), data.size() * sizeof(T)); + utils::SetRawDataInTensorProto(tensor_proto, data.data(), data.size() * sizeof(T)); const Initializer init(tensor_proto, std::filesystem::path()); for (size_t idx = 0; idx < data.size(); idx++) { diff --git a/onnxruntime/test/optimizer/nchwc_optimizer_test.cc b/onnxruntime/test/optimizer/nchwc_optimizer_test.cc index 8e4edc9e0abbb..538f60040418c 100644 --- a/onnxruntime/test/optimizer/nchwc_optimizer_test.cc +++ b/onnxruntime/test/optimizer/nchwc_optimizer_test.cc @@ -6,6 +6,7 @@ #include "core/mlas/inc/mlas.h" #include "core/session/environment.h" #include "core/session/inference_session.h" +#include "core/framework/tensorprotoutils.h" #include "test/compare_ortvalue.h" #include "test/test_environment.h" 
#include "test/framework/test_utils.h" @@ -62,7 +63,7 @@ struct NchwcTestHelper { ONNX_NAMESPACE::TensorProto tensor_proto; tensor_proto.set_name(name); tensor_proto.set_data_type(utils::ToTensorProtoElementType()); - tensor_proto.set_raw_data(data.data(), data.size() * sizeof(T)); + utils::SetRawDataInTensorProto(tensor_proto, data.data(), data.size() * sizeof(T)); for (auto& dim : shape) { tensor_proto.add_dims(dim); diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index 1db8616c85daa..01de15e6f8ec8 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -73,7 +73,7 @@ void BaseTester::AddInitializers(onnxruntime::Graph& graph) { } } else { auto buffer_size = tensor.DataType()->Size() * shape.Size(); - tensor_proto.set_raw_data(tensor.DataRaw(), buffer_size); + utils::SetRawDataInTensorProto(tensor_proto, tensor.DataRaw(), buffer_size); } // 4. name diff --git a/onnxruntime/test/providers/cpu/generator/random_test.cc b/onnxruntime/test/providers/cpu/generator/random_test.cc index be049d1cf0ce3..ec9b1614488a7 100644 --- a/onnxruntime/test/providers/cpu/generator/random_test.cc +++ b/onnxruntime/test/providers/cpu/generator/random_test.cc @@ -256,7 +256,7 @@ TEST(Random, MultinomialGoodCase) { const std::vector output_dims{batch_size, num_samples}; #ifdef _WIN32 const std::vector expected_output{2, 0, 0, 2, 2, 2, 0, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 0}; -#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) +#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) || defined(_AIX) const std::vector expected_output{1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 2, 0, 2, 2, 0, 2, 1, 1}; #else const std::vector expected_output{2, 0, 0, 1, 0, 1, 2, 0, 1, 0, 0, 1, 1, 0, 1, 0, 2, 0, 2, 0}; @@ -294,7 +294,7 @@ TEST(Random, MultinomialDefaultDType) { #ifdef _WIN32 const std::vector expected_output_1{2, 0, 0, 2, 2, 2, 0, 2, 2, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 0}; const std::vector expected_output_2{0, 0, 1, 0, 2, 2, 2, 0, 2, 1, 2, 1, 0, 2, 0, 2, 2, 1, 2, 1}; -#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) +#elif defined(__MACH__) || defined(__ANDROID__) || defined(__FreeBSD__) || defined(__wasm__) || defined(_AIX) const std::vector expected_output_1{1, 1, 2, 2, 0, 2, 2, 2, 0, 2, 1, 1, 2, 0, 2, 2, 0, 2, 1, 1}; const std::vector expected_output_2{1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 2, 0, 1, 1, 0, 2, 2, 2, 1}; #else diff --git a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc index bd97306142f18..4fc2e6c7c909b 100644 --- a/onnxruntime/test/providers/cpu/tensor/isinf_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/isinf_test.cc @@ -18,13 +18,17 @@ constexpr double DOUBLE_NINF = -std::numeric_limits::infinity(); constexpr double DOUBLE_NAN = std::numeric_limits::quiet_NaN(); template -void run_is_inf_test(int opset, int64_t detect_positive, int64_t detect_negative, const std::initializer_list& input, const std::initializer_list& output) { +void run_is_inf_test(int opset, int64_t detect_positive, int64_t detect_negative, const std::initializer_list& input, const std::initializer_list& output, bool skip_trt = false) { OpTester test("IsInf", opset); test.AddAttribute("detect_positive", detect_positive); test.AddAttribute("detect_negative", detect_negative); test.AddInput("X", {onnxruntime::narrow(input.size())}, input); test.AddOutput("Y", 
{onnxruntime::narrow(output.size())}, output); - test.Run(); + if (skip_trt) { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); + } else { + test.Run(); + } } TEST(IsInfTest, test_isinf_float10) { @@ -124,7 +128,7 @@ TEST(IsInfTest, test_isinf_bfloat16) { std::initializer_list input = {BFloat16{-1.7f}, BFloat16::NaN, BFloat16::Infinity, 3.6_bfp16, BFloat16::NegativeInfinity, BFloat16::Infinity}; std::initializer_list output = {false, false, true, false, true, true}; - run_is_inf_test(20, 1, 1, input, output); + run_is_inf_test(20, 1, 1, input, output, true); // Skip as TRT10 supports BF16 but T4 GPU run on TRT CIs doesn't } TEST(IsInfTest, test_isinf_positive_bfloat16) { @@ -146,7 +150,7 @@ TEST(IsInfTest, test_Float8E4M3FN) { std::initializer_list input = { Float8E4M3FN(-1.0f), Float8E4M3FN(FLOAT_NAN, false), Float8E4M3FN(1.0f), Float8E4M3FN(FLOAT_NINF, false), Float8E4M3FN(FLOAT_NINF, false), Float8E4M3FN(FLOAT_INF, false)}; std::initializer_list output = {false, false, false, false, false, false}; - run_is_inf_test(20, 1, 1, input, output); + run_is_inf_test(20, 1, 1, input, output, true); // Skip as TRT10.1 supports Float8 but T4 GPU run on TRT CIs doesn't } TEST(IsInfTest, test_Float8E4M3FNUZ) { diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h index fa1c739c04e3a..f96c8ce9ce729 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h @@ -13,7 +13,7 @@ */ #pragma once - +#if defined(CUDA_VERSION) && CUDA_VERSION <= 12030 #include "test/cuda_host/blkq4_fp16_quant_sm80.h" #include @@ -197,3 +197,4 @@ void run_blkq4_small_gemm(int m, int n, int k); } // namespace test } // namespace cuda } // namespace onnxruntime +#endif diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc index b95e093e41eab..3fcb9045ee7e6 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc @@ -10,7 +10,7 @@ * This part requires gtest header files, which do not play * well with CUTLASS headers. */ - +#if defined(CUDA_VERSION) && CUDA_VERSION <= 12030 #include "blkq4_fp16_gemm_sm80.h" #include "gtest/gtest.h" @@ -341,3 +341,4 @@ TEST(BlkQ4_GEMM, Sm80SmallTileKernelTest) { } // namespace test } // namespace onnxruntime +#endif diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu index f5600ca9885a3..8b27c3d8c3aed 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu @@ -11,6 +11,9 @@ * well with gtest headers. 
*/ +// This test has build error with cuda 12.5 +#if defined(CUDA_VERSION) && CUDA_VERSION <= 12030 + #include "blkq4_fp16_gemm_sm80.h" #include @@ -532,3 +535,5 @@ template void run_blkq4_small_gemm<128, false, false>(int m, int n, int k); } // namespace test } // namespace cuda } // namespace onnxruntime + +#endif diff --git a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py index 88e5052db4e2e..4cc8a0c151d14 100644 --- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py +++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py @@ -14,7 +14,7 @@ import numpy as np import onnx from onnx import TensorProto, helper -from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count +from op_test_utils import TestDataFeeds, check_model_correctness, check_op_type_count, check_qtype_by_node_type from onnxruntime.quantization import quant_utils @@ -105,8 +105,9 @@ def make_matmul( [output_tensor], initializer=initializers, ) - model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 13)]) - model.ir_version = 7 # use stable onnx ir version + # blocked quantization requires DQ op set >= 21 + model = helper.make_model(graph, opset_imports=[helper.make_opsetid("", 21)]) + model.ir_version = 10 # use stable onnx ir version onnx.save(model, output_model_path) @@ -116,9 +117,12 @@ def quant_test( data_reader: TestDataFeeds, block_size: int, is_symmetric: bool, + quant_format: quant_utils.QuantFormat = quant_utils.QuantFormat.QOperator, ): + use_qdq = quant_format == quant_utils.QuantFormat.QDQ + name_prefix = "DQ_MatMul" if use_qdq else "MatMulNBits" model_int4_path = str( - Path(self._tmp_model_dir.name).joinpath(f"MatMulNBits_{block_size}_{is_symmetric}.onnx").absolute() + Path(self._tmp_model_dir.name).joinpath(f"{name_prefix}_{block_size}_{is_symmetric}.onnx").absolute() ) # Quantize fp32 model to int4 model @@ -126,15 +130,33 @@ def quant_test( model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path)) quant_config = matmul_4bits_quantizer.DefaultWeightOnlyQuantConfig( - block_size=block_size, is_symmetric=is_symmetric + block_size=block_size, is_symmetric=is_symmetric, quant_format=quant_format ) quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, algo_config=quant_config) quant.process() quant.model.save_model_to_file(model_int4_path, False) - quant_nodes = {"MatMulNBits": 1} + quant_nodes = {"DequantizeLinear": 1, "MatMul": 1} if use_qdq else {"MatMulNBits": 1} check_op_type_count(self, model_int4_path, **quant_nodes) + if use_qdq: + dq_qtype = TensorProto.INT4 if is_symmetric else TensorProto.UINT4 + dqnode_io_qtypes = ( + { + "DequantizeLinear": [ + ["i", 0, dq_qtype], + ] + } + if is_symmetric + else { + "DequantizeLinear": [ + ["i", 0, dq_qtype], + ["i", 2, dq_qtype], + ] + } + ) + check_qtype_by_node_type(self, model_int4_path, dqnode_io_qtypes) + data_reader.rewind() try: @@ -211,6 +233,26 @@ def test_quantize_matmul_int4_offsets(self): data_reader = self.input_feeds(1, {"input": [100, 52]}) self.quant_test(model_fp32_path, data_reader, 32, False) + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_symmetric_qdq(self): + np.random.seed(13) + + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_symmetric.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, symmetric=True) + data_reader = 
self.input_feeds(1, {"input": [100, 52]}) + self.quant_test(model_fp32_path, data_reader, 32, True, quant_utils.QuantFormat.QDQ) + + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_offsets_qdq(self): + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, symmetric=False) + data_reader = self.input_feeds(1, {"input": [100, 52]}) + self.quant_test(model_fp32_path, data_reader, 32, False, quant_utils.QuantFormat.QDQ) + @unittest.skipIf( find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" ) diff --git a/onnxruntime/test/python/transformers/test_flash_attn_rocm.py b/onnxruntime/test/python/transformers/test_flash_attn_rocm.py index fe7e39722237f..880f4175e00b7 100644 --- a/onnxruntime/test/python/transformers/test_flash_attn_rocm.py +++ b/onnxruntime/test/python/transformers/test_flash_attn_rocm.py @@ -35,8 +35,8 @@ def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_inte rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - rtol=0.002, - atol=0.002, + rtol=0.001, + atol=0.005, ) parity_check_gqa_prompt_no_buff( config, @@ -45,8 +45,8 @@ def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_inte rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - rtol=0.002, - atol=0.002, + rtol=0.001, + atol=0.005, ) @parameterized.expand(gqa_past_flash_attention_test_cases()) @@ -67,8 +67,8 @@ def test_gqa_past_flash_attention(self, _, config, local, rotary, rotary_interle rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - rtol=0.002, - atol=0.002, + rtol=0.001, + atol=0.005, ) parity_check_gqa_past_no_buff( config, @@ -77,8 +77,8 @@ def test_gqa_past_flash_attention(self, _, config, local, rotary, rotary_interle rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - rtol=0.002, - atol=0.002, + rtol=0.001, + atol=0.005, ) diff --git a/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json b/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json new file mode 100644 index 0000000000000..05fcf08cd3232 --- /dev/null +++ b/orttraining/tools/ci_test/results/ci-mi200.huggingface.bert-large-rocm6.1.json @@ -0,0 +1,57 @@ +{ + "steps": [ + { + "step": 20, + "loss": 2.0136 + }, + { + "step": 40, + "loss": 1.8466 + }, + { + "step": 60, + "loss": 1.7525 + }, + { + "step": 80, + "loss": 1.6682 + }, + { + "step": 100, + "loss": 1.658 + }, + { + "step": 120, + "loss": 1.6749 + }, + { + "step": 140, + "loss": 1.6263 + }, + { + "step": 160, + "loss": 1.6828 + }, + { + "step": 180, + "loss": 1.6145 + }, + { + "step": 200, + "loss": 1.6197 + }, + { + "step": 220, + "loss": 1.6353 + }, + { + "step": 240, + "loss": 1.5266 + }, + { + "step": 260, + "loss": 1.5441 + } + ], + "samples_per_second": 34.561 +} diff --git a/setup.py b/setup.py index 5750833ce35de..51feedcfd3286 100644 --- a/setup.py +++ b/setup.py @@ -56,6 +56,7 @@ def parse_arg_remove_string(argv, arg_name_equal): cuda_version = None rocm_version = None +is_migraphx = False is_rocm = False is_openvino = False # The following arguments are mutually exclusive @@ -64,8 +65,9 @@ def parse_arg_remove_string(argv, arg_name_equal): cuda_version = parse_arg_remove_string(sys.argv, "--cuda_version=") elif parse_arg_remove_boolean(sys.argv, "--use_rocm"): is_rocm = 
True - package_name = "onnxruntime-rocm" if not nightly_build else "ort-rocm-nightly" rocm_version = parse_arg_remove_string(sys.argv, "--rocm_version=") +elif parse_arg_remove_boolean(sys.argv, "--use_migraphx"): + is_migraphx = True elif parse_arg_remove_boolean(sys.argv, "--use_openvino"): is_openvino = True package_name = "onnxruntime-openvino" @@ -87,6 +89,9 @@ def parse_arg_remove_string(argv, arg_name_equal): elif parse_arg_remove_boolean(sys.argv, "--use_qnn"): package_name = "onnxruntime-qnn" +if is_rocm or is_migraphx: + package_name = "onnxruntime-rocm" if not nightly_build else "ort-rocm-nightly" + # PEP 513 defined manylinux1_x86_64 and manylinux1_i686 # PEP 571 defined manylinux2010_x86_64 and manylinux2010_i686 # PEP 599 defines the following platform tags: @@ -280,10 +285,21 @@ def finalize_options(self): return ret -providers_cuda_or_rocm = "libonnxruntime_providers_" + ("rocm.so" if is_rocm else "cuda.so") -providers_tensorrt_or_migraphx = "libonnxruntime_providers_" + ("migraphx.so" if is_rocm else "tensorrt.so") -providers_openvino = "libonnxruntime_providers_openvino.so" -providers_cann = "libonnxruntime_providers_cann.so" +providers_cuda_or_rocm = "onnxruntime_providers_" + ("rocm" if is_rocm else "cuda") +providers_tensorrt_or_migraphx = "onnxruntime_providers_" + ("migraphx" if is_migraphx else "tensorrt") +providers_openvino = "onnxruntime_providers_openvino" +providers_cann = "onnxruntime_providers_cann" + +if platform.system() == "Linux": + providers_cuda_or_rocm = "lib" + providers_cuda_or_rocm + ".so" + providers_tensorrt_or_migraphx = "lib" + providers_tensorrt_or_migraphx + ".so" + providers_openvino = "lib" + providers_openvino + ".so" + providers_cann = "lib" + providers_cann + ".so" +elif platform.system() == "Windows": + providers_cuda_or_rocm = providers_cuda_or_rocm + ".dll" + providers_tensorrt_or_migraphx = providers_tensorrt_or_migraphx + ".dll" + providers_openvino = providers_openvino + ".dll" + providers_cann = providers_cann + ".dll" # Additional binaries dl_libs = [] @@ -335,6 +351,9 @@ def finalize_options(self): "dnnl.dll", "mklml.dll", "libiomp5md.dll", + providers_cuda_or_rocm, + providers_tensorrt_or_migraphx, + providers_cann, "onnxruntime.dll", ] # DNNL, TensorRT & OpenVINO EPs are built as shared libs diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index ae4c9b27544ba..75fbf5d0851ae 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -613,6 +613,7 @@ def convert_arg_line_to_args(self, arg_line): "MinGW Makefiles", "Ninja", "NMake Makefiles", + "NMake Makefiles JOM", "Unix Makefiles", "Visual Studio 17 2022", "Xcode", @@ -2211,6 +2212,7 @@ def build_python_wheel( use_cuda, cuda_version, use_rocm, + use_migraphx, rocm_version, use_dnnl, use_tensorrt, @@ -2262,6 +2264,8 @@ def build_python_wheel( args.append("--use_rocm") if rocm_version: args.append(f"--rocm_version={rocm_version}") + elif use_migraphx: + args.append("--use_migraphx") elif use_openvino: args.append("--use_openvino") elif use_dnnl: @@ -2587,9 +2591,6 @@ def main(): if args.use_tensorrt: args.use_cuda = True - if args.use_migraphx: - args.use_rocm = True - if args.build_wheel or args.gen_doc or args.use_tvm or args.enable_training: args.enable_pybind = True @@ -2885,7 +2886,8 @@ def main(): # fail unexpectedly. Similar, if your packaging step forgot to copy a file into the package, we don't know it # either. 
if args.build: - # TODO: find asan DLL and copy it to onnxruntime/capi folder when args.enable_address_sanitizer is True and the target OS is Windows + # TODO: find asan DLL and copy it to onnxruntime/capi folder when args.enable_address_sanitizer is True and + # the target OS is Windows if args.build_wheel: nightly_build = bool(os.getenv("NIGHTLY_BUILD") == "1") default_training_package_device = bool(os.getenv("DEFAULT_TRAINING_PACKAGE_DEVICE") == "1") @@ -2896,6 +2898,7 @@ def main(): args.use_cuda, args.cuda_version, args.use_rocm, + args.use_migraphx, args.rocm_version, args.use_dnnl, args.use_tensorrt, diff --git a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md index dec05ae066a4a..1bbb933f66ba4 100644 --- a/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md +++ b/tools/ci_build/github/apple/coreml_supported_mlprogram_ops.md @@ -18,3 +18,6 @@ Keep in sync with doco generated from /docs/execution-providers/CoreML-Execution |ai.onnx:Relu|| |ai.onnx:Reshape|| |ai.onnx:Sub|| +|ai.onnx:Sigmoid|| +|ai.onnx:Tanh|| +|ai.onnx:Transpose|| diff --git a/tools/ci_build/github/apple/test_ios_framework_build_settings.json b/tools/ci_build/github/apple/test_ios_framework_build_settings.json new file mode 100644 index 0000000000000..0572df6ecf72e --- /dev/null +++ b/tools/ci_build/github/apple/test_ios_framework_build_settings.json @@ -0,0 +1,30 @@ +{ + "build_osx_archs": { + "iphoneos": [ + "arm64" + ], + "iphonesimulator": [ + "arm64", + "x86_64" + ] + }, + "build_params": { + "base": [ + "--parallel", + "--use_xcode", + "--build_apple_framework", + "--use_coreml", + "--use_xnnpack", + "--skip_tests", + "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF" + ], + "iphoneos": [ + "--ios", + "--apple_deploy_target=13.0" + ], + "iphonesimulator": [ + "--ios", + "--apple_deploy_target=13.0" + ] + } +} diff --git a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml index 72f236ec2e6cc..10d9a9a24d88a 100644 --- a/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-x86_64-crosscompile-ci-pipeline.yml @@ -48,12 +48,12 @@ parameters: stages: # Separate stage for building CPU vs NNAPI as we only want CodeQL to run on one of them so we don't get duplicate # issues for code that is built in both. We pick NNAPI as that includes the NNAPI EP code. 
-- stage: BUILD_CPU_STAGE +- stage: BUILD_AND_TEST_CPU dependsOn: [] variables: Codeql.Enabled: false jobs: - - job: Build_CPU_EP + - job: BUILD_AND_TEST_CPU pool: onnxruntime-Ubuntu2204-AMD-CPU workspace: clean: all @@ -78,12 +78,14 @@ stages: - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build displayName: Install coreutils and ninja - - template: "templates/use-android-ndk.yml" - + - template: templates/use-android-ndk.yml + - template: templates/use-android-emulator.yml + parameters: + create: true + start: true - script: | env | grep ANDROID displayName: View Android ENVs - - script: | python3 tools/ci_build/build.py \ --enable_lto \ @@ -96,42 +98,17 @@ stages: --skip_submodule_sync \ --parallel \ --cmake_generator=Ninja \ - --build_java \ - --skip_tests - displayName: CPU EP, Build - - - task: CopyFiles@2 - displayName: Copy apks - inputs: - contents: 'build/**/*.apk' - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: CopyFiles@2 - displayName: Copy test data - inputs: - contents: 'build/**/testdata/**' - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: CopyFiles@2 - displayName: Copy test executables - inputs: - contents: | - build/Debug/* - build/Debug/java/androidtest/android/** - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: PublishBuildArtifacts@1 - inputs: - pathToPublish: $(Build.ArtifactStagingDirectory) - artifactName: CPUBuildOutput + --build_java + displayName: CPU EP, Build and Test + - template: templates/use-android-emulator.yml + parameters: + stop: true - template: templates/clean-agent-build-directory-step.yml -- stage: BUILD_NNAPI_STAGE +- stage: BUILD_AND_TEST_NNAPI_EP dependsOn: [] + condition: notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') variables: Codeql.ProjectConfigPath: .github/workflows Codeql.Enabled: true @@ -140,14 +117,12 @@ stages: JobsTimeout: 120 ${{ else }}: JobsTimeout: 60 - jobs: - - job: Build_NNAPI_EP + - job: BUILD_AND_TEST_NNAPI_EP pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: ${{ variables.JobsTimeout }} workspace: clean: all - condition: notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') steps: - task: UsePythonVersion@0 displayName: Use Python $(pythonVersion) @@ -163,8 +138,10 @@ stages: - script: sudo apt-get update -y && sudo apt-get install -y coreutils ninja-build displayName: Install coreutils and ninja - - - template: "templates/use-android-ndk.yml" + - template: templates/use-android-emulator.yml + parameters: + create: true + start: true - script: | env | grep ANDROID @@ -172,194 +149,31 @@ stages: - script: | python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=29 \ - --skip_submodule_sync \ - --parallel \ - --use_nnapi \ - --cmake_generator=Ninja \ - --build_java \ - --skip_tests - displayName: NNAPI EP, Build - - - task: CopyFiles@2 - displayName: Copy apks - inputs: - contents: 'build_nnapi/**/*.apk' - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: CopyFiles@2 - displayName: Copy test data - inputs: - contents: 'build_nnapi/**/testdata/**' - targetFolder: $(Build.ArtifactStagingDirectory) - overWrite: true - - - task: CopyFiles@2 - displayName: Copy Test Executables - inputs: - contents: | - build_nnapi/Debug/* - build_nnapi/Debug/java/androidtest/android/** - targetFolder: 
$(Build.ArtifactStagingDirectory) - overWrite: true - - - task: PublishBuildArtifacts@1 - inputs: - pathToPublish: $(Build.ArtifactStagingDirectory) - artifactName: NNAPIBuildOutput + --enable_lto \ + --android \ + --build_dir build_nnapi \ + --android_sdk_path $ANDROID_HOME \ + --android_ndk_path $ANDROID_NDK_HOME \ + --android_abi=x86_64 \ + --android_api=29 \ + --skip_submodule_sync \ + --parallel \ + --use_nnapi \ + --build_shared_lib \ + --cmake_generator=Ninja \ + --build_java + displayName: NNAPI EP, Build, Test on Android Emulator + + - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) + # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator + displayName: Build Minimal ORT with NNAPI and run tests + + - template: templates/use-android-emulator.yml + parameters: + stop: true - template: templates/clean-agent-build-directory-step.yml -- stage: TEST_STAGE - dependsOn: [BUILD_CPU_STAGE, BUILD_NNAPI_STAGE] - jobs: - - job: Test_CPU_EP - pool: - # We need macOS-12 to run the Android emulator for now. - # https://github.com/actions/runner-images/issues/7671 - vmImage: 'macOS-12' - workspace: - clean: all - condition: succeeded() - steps: - - script: | - set -ex - system_profiler SPSoftwareDataType SPHardwareDataType - displayName: 'Mac Agent Info' - - - task: DownloadPipelineArtifact@2 - inputs: - ${{ if eq(parameters.specificArtifact, true) }}: - source: 'specific' - project: 'onnxruntime' - pipeline: $(Build.DefinitionName) - runVersion: 'specific' - runId: ${{ parameters.runId }} - ${{ if ne(parameters.specificArtifact, true) }}: - source: 'current' - artifact: 'CPUBuildOutput' - path: $(Build.SourcesDirectory) - - - task: UsePythonVersion@0 - displayName: Use Python $(pythonVersion) - inputs: - versionSpec: $(pythonVersion) - - - task: JavaToolInstaller@0 - displayName: Use jdk 11 - inputs: - versionSpec: '11' - jdkArchitectureOption: 'x64' - jdkSourceOption: 'PreInstalled' - - - template: "templates/use-android-ndk.yml" - - - template: templates/use-android-emulator.yml - parameters: - create: true - start: true - - - script: | - python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=30 \ - --build_java \ - --test - displayName: CPU EP, Test on Android Emulator - - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml - - - job: Test_NNAPI_EP - pool: - # We need macOS-12 to run the Android emulator for now. 
- # https://github.com/actions/runner-images/issues/7671 - vmImage: 'macOS-12' - timeoutInMinutes: 90 - workspace: - clean: all - condition: and(succeeded(), notIn(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) - steps: - - script: | - set -ex - system_profiler SPSoftwareDataType SPHardwareDataType - displayName: 'Mac Agent Info' - - - task: DownloadPipelineArtifact@2 - inputs: - ${{ if eq(parameters.specificArtifact, true) }}: - source: 'specific' - project: 'onnxruntime' - pipeline: $(Build.DefinitionName) - runVersion: 'specific' - runId: ${{ parameters.runId }} - ${{ if ne(parameters.specificArtifact, true) }}: - source: 'current' - artifact: 'NNAPIBuildOutput' - path: $(Build.SourcesDirectory) - - - task: UsePythonVersion@0 - displayName: Use Python $(pythonVersion) - inputs: - versionSpec: $(pythonVersion) - - - task: JavaToolInstaller@0 - displayName: Use jdk 11 - inputs: - versionSpec: '11' - jdkArchitectureOption: 'x64' - jdkSourceOption: 'PreInstalled' - - - template: "templates/use-android-ndk.yml" - - - template: templates/use-android-emulator.yml - parameters: - create: true - start: true - - - script: | - python3 tools/ci_build/build.py \ - --enable_lto \ - --android \ - --build_dir build_nnapi \ - --android_sdk_path $ANDROID_HOME \ - --android_ndk_path $ANDROID_NDK_HOME \ - --android_abi=x86_64 \ - --android_api=29 \ - --build_java \ - --use_nnapi \ - --test - displayName: NNAPI EP, Test, CodeCoverage on Android Emulator - - # used by Build Minimal ORT - - script: brew install coreutils ninja - displayName: Install coreutils and ninja - - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) - # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator - displayName: Build Minimal ORT with NNAPI and run tests - - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml - - stage: MASTER_BUILD_STAGE # The below jobs only run on master build. # because coverage report is hard to support in cross machines. @@ -368,20 +182,12 @@ stages: condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') jobs: - job: NNAPI_EP_MASTER - pool: - # We need macOS-12 to run the Android emulator for now. 
- # https://github.com/actions/runner-images/issues/7671 - vmImage: 'macOS-12' + pool: onnxruntime-Ubuntu2204-AMD-CPU timeoutInMinutes: 180 workspace: clean: all condition: in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI') steps: - - script: | - set -ex - system_profiler SPSoftwareDataType SPHardwareDataType - displayName: 'Mac Agent Info' - - task: UsePythonVersion@0 displayName: Use Python $(pythonVersion) inputs: @@ -394,11 +200,7 @@ stages: jdkArchitectureOption: 'x64' jdkSourceOption: 'PreInstalled' - - template: "templates/use-android-ndk.yml" - - # used by Build Minimal ORT - - script: brew install coreutils ninja - displayName: Install coreutils and ninja + - template: templates/use-android-ndk.yml - template: templates/use-android-emulator.yml parameters: @@ -429,50 +231,25 @@ stages: --build_dir build_nnapi \ --android_sdk_path $ANDROID_HOME displayName: Retrieve runtime code coverage files from the emulator and analyze + - script: cat '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' displayName: Print coverage report - - task: PublishPipelineArtifact@0 - displayName: 'Publish code coverage report' - inputs: - artifactName: "coverage_rpt.txt" - targetPath: '$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt' - publishLocation: 'pipeline' - - script: /bin/bash tools/ci_build/github/linux/ort_minimal/nnapi_minimal_build_minimal_ort_and_run_tests.sh $(pwd) # Build Minimal ORT with NNAPI and reduced Ops, run unit tests on Android Emulator displayName: Build Minimal ORT with NNAPI and run tests - - template: templates/use-android-emulator.yml - parameters: - stop: true - - - template: templates/clean-agent-build-directory-step.yml - - - job: Update_Dashboard - workspace: - clean: all - variables: - - name: skipComponentGovernanceDetection - value: true - pool: 'onnxruntime-Ubuntu2204-AMD-CPU' - condition: and(succeeded(), in(variables['Build.Reason'], 'IndividualCI', 'BatchedCI')) - dependsOn: - - NNAPI_EP_MASTER - steps: - - task: DownloadPipelineArtifact@0 - displayName: 'Download code coverage report' - inputs: - artifactName: 'coverage_rpt.txt' - targetPath: '$(Build.BinariesDirectory)' - - task: AzureCLI@2 displayName: 'Post Android Code Coverage To DashBoard' inputs: azureSubscription: AIInfraBuild scriptType: bash scriptPath: $(Build.SourcesDirectory)/tools/ci_build/github/linux/upload_code_coverage_data.sh - arguments: '"$(Build.BinariesDirectory)/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi' + arguments: '"$(Build.SourcesDirectory)/build_nnapi/Debug/coverage_rpt.txt" "https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=$(Build.BuildId)" arm android nnapi' workingDirectory: '$(Build.BinariesDirectory)' + - template: templates/use-android-emulator.yml + parameters: + stop: true + - template: templates/clean-agent-build-directory-step.yml diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 41b3c47ba0396..a66828ee5e188 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -43,7 +43,7 @@ variables: - name: docker_base_image value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 - name: linux_trt_version - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 - name: Repository value: 
'onnxruntimecuda11manylinuxbuild' diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 8b386dde7d3a7..700326fe9173c 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -83,7 +83,7 @@ variables: value: 11.8 - name: win_trt_home - value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 + value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 - name: win_cuda_home value: $(Agent.TempDirectory)\v11.8 diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index daf95af438d2b..9fd13b513e5fd 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -68,9 +68,9 @@ variables: value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 - name: win_trt_home ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 + value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4 + value: $(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5 - name: win_cuda_home ${{ if eq(parameters.CudaVersion, '11.8') }}: value: $(Agent.TempDirectory)\v11.8 diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 5f63339fb0d00..3f9707ff50519 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -43,9 +43,9 @@ variables: value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 10.0.1.6-1.cuda12.4 + value: 10.2.0.19-1.cuda12.5 jobs: - job: Linux_Build diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index f36cd9cfbfca1..6bf6324252fb9 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -36,7 +36,7 @@ variables: - name: render value: 109 - name: RocmVersion - value: 6.0 + value: 6.1 - name: RocmVersionPatchSuffix value: ".3" diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index b9a5383836447..56e9c73a10a82 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -61,7 +61,7 @@ stages: ${{ if eq(parameters.CudaVersion, '12.2') }}: DockerBuildArgs: " --build-arg BASEIMAGE=nvidia/cuda:12.2.2-devel-ubuntu20.04 - --build-arg TRT_VERSION=10.0.1.6-1+cuda12.4 + --build-arg TRT_VERSION=10.2.0.19-1+cuda12.5 --build-arg BUILD_UID=$( id -u ) " ${{ else }}: diff --git 
a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml index 001062452644e..0e1afdcc5b8ca 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-pai-ci-pipeline.yml @@ -25,7 +25,7 @@ variables: - name: render value: 109 - name: RocmVersion - value: 6.0 + value: 6.1 - name: RocmVersionPatchSuffix value: ".3" - name: BuildConfig diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index d6a3fa3147a47..593d45361324e 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -226,7 +226,7 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_trt.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo @@ -446,14 +446,15 @@ stages: python tools/ci_build/github/apple/build_apple_framework.py \ --build_dir "$(Build.BinariesDirectory)/ios_framework" \ --build_dynamic_framework \ - tools/ci_build/github/apple/default_full_apple_framework_build_settings.json + tools/ci_build/github/apple/test_ios_framework_build_settings.json displayName: "Build iOS dynamic framework" - script: | python tools/ci_build/github/apple/test_apple_packages.py \ --framework_info_file "$(Build.BinariesDirectory)/ios_framework/xcframework_info.json" \ --c_framework_dir "$(Build.BinariesDirectory)/ios_framework/framework_out" \ - --variant Full + --variant Full \ + --skip_macos_test displayName: "Test pod with iOS framework" - stage: IosMinimalTrainingBuild diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index 63e70fa8e6488..d57a7585f3cff 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -55,7 +55,7 @@ stages: python_wheel_suffix: '_gpu' timeout: 480 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 - trt_version: '10.0.1.6-1.cuda11.8' + trt_version: '10.2.0.19-1.cuda11.8' cuda_version: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml index b6943f9e1b77b..7dfafeb67acf8 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml @@ -49,9 +49,9 @@ jobs: value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: - 
value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 10.0.1.6-1.cuda12.4 + value: 10.2.0.19-1.cuda12.5 pool: ${{ parameters.machine_pool }} steps: - checkout: self diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml index cca53e36ebab9..2ca5129ac6e5d 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ -80,9 +80,9 @@ stages: - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 10.0.1.6-1.cuda12.4 + value: 10.2.0.19-1.cuda12.5 steps: - checkout: self clean: true @@ -149,9 +149,9 @@ stages: value: '12' - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: 10.0.1.6-1.cuda11.8 + value: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 10.0.1.6-1.cuda12.4 + value: 10.2.0.19-1.cuda12.5 steps: - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime submodules: false diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml index 01f0337be7714..dcd681bd4b915 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml @@ -65,9 +65,9 @@ stages: SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} ${{ if eq(parameters.cuda_version, '11.8') }}: - EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ${{ if eq(parameters.cuda_version, '12.2') }}: - EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4 --cuda_home=$(Agent.TempDirectory)\v12.2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --enable_lto --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5 --cuda_home=$(Agent.TempDirectory)\v12.2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - ${{ if eq(parameters.enable_linux_gpu, true) }}: - template: ../templates/py-linux-gpu.yml @@ -79,7 +79,7 @@ stages: cuda_version: ${{ parameters.cuda_version }} ${{ if eq(parameters.cuda_version, '11.8') }}: docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 - trt_version: 10.0.1.6-1.cuda11.8 + trt_version: 10.2.0.19-1.cuda11.8 ${{ if eq(parameters.cuda_version, '12.2') }}: docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20240610.1 - trt_version: 10.0.1.6-1.cuda12.4 + trt_version: 10.2.0.19-1.cuda12.5 diff --git 
a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index 0dd9ffd5282e7..de29a3de9fded 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -13,10 +13,10 @@ parameters: - 12.2 - name: TrtVersion type: string - default: '10.0.1.6' + default: '10.2.0.19' values: - 8.6.1.6 - - 10.0.1.6 + - 10.2.0.19 steps: - ${{ if eq(parameters.DownloadCUDA, true) }}: @@ -42,9 +42,9 @@ steps: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.0.1.6')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.2.0.19')) }}: - powershell: | - Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.4" + Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.5" displayName: Set trtCudaVersion - script: | diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index 6c82958fc0b78..63d521f1e7d9a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -24,17 +24,11 @@ steps: displayName: 'Download Secondary CUDA SDK v${{ parameters.SecondaryCUDAVersion }}' - ${{ if eq(parameters.DownloadTRT, 'true') }}: - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8" $(Agent.TempDirectory) - displayName: 'Download TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8' + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" $(Agent.TempDirectory) + displayName: 'Download TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8' - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0" $(Agent.TempDirectory) - displayName: 'Download TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0' - - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" $(Agent.TempDirectory) - displayName: 'Download TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8' - - powershell: | - azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4" $(Agent.TempDirectory) - displayName: 'Download TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4' + azcopy.exe cp --recursive "https://lotusscus.blob.core.windows.net/models/local/TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5" $(Agent.TempDirectory) + displayName: 'Download TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5' - task: BatchScript@1 displayName: 'setup env' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml index 97f95797be1f1..6c66cceb33d5c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml @@ -22,10 +22,10 @@ parameters: - name: trt_version type: string - default: '10.0.1.6-1.cuda11.8' + default: '10.2.0.19-1.cuda11.8' values: - - 
10.0.1.6-1.cuda11.8 - - 10.0.1.6-1.cuda12.4 + - 10.2.0.19-1.cuda11.8 + - 10.2.0.19-1.cuda12.5 - name: cuda_version type: string default: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml index 3081624225b12..8eca22c8c123f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-linux-test-cuda.yml @@ -18,10 +18,10 @@ parameters: - name: trt_version type: string - default: '10.0.1.6-1.cuda11.8' + default: '10.2.0.19-1.cuda11.8' values: - - 10.0.1.6-1.cuda11.8 - - 10.0.1.6-1.cuda12.4 + - 10.2.0.19-1.cuda11.8 + - 10.2.0.19-1.cuda12.5 - name: cuda_version type: string default: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml index 3f1c4ef0f8d61..47980955b8798 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-selectable-stage.yml @@ -381,7 +381,7 @@ stages: variables: CUDA_VERSION: '11.8' buildArch: x64 - EpBuildFlags: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80" + EpBuildFlags: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_version=$(CUDA_VERSION) --cuda_home="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v$(CUDA_VERSION)" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80" EnvSetupScript: setup_env_gpu.bat EP_NAME: gpu VSGenerator: 'Visual Studio 17 2022' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 9e14789f3b234..27f85dc5c1648 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -288,7 +288,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.8' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -298,7 +298,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.9' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines 
"CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -308,7 +308,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.10' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -318,7 +318,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.11' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -328,7 +328,7 @@ stages: parameters: MACHINE_POOL: 'onnxruntime-Win2022-GPU-A10' PYTHON_VERSION: '3.12' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" ENV_SETUP_SCRIPT: setup_env_gpu.bat EP_NAME: gpu publish_symbols: ${{ parameters.publish_symbols }} @@ -498,7 +498,7 @@ stages: docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} - trt_version: '10.0.1.6-1.cuda11.8' + trt_version: '10.2.0.19-1.cuda11.8' cuda_version: '11.8' - ${{ if eq(parameters.enable_windows_arm64_qnn, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/use-android-emulator.yml b/tools/ci_build/github/azure-pipelines/templates/use-android-emulator.yml index b31882c8da18f..4251a8401f8f0 100644 --- a/tools/ci_build/github/azure-pipelines/templates/use-android-emulator.yml +++ b/tools/ci_build/github/azure-pipelines/templates/use-android-emulator.yml @@ -15,6 +15,25 @@ parameters: steps: - ${{ if eq(parameters.create, true) }}: + - script: | + if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/emulator:"* ]]; then + echo "${ANDROID_SDK_ROOT}/emulator is in PATH" + else + ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager --install "emulator" + echo "##vso[task.prependpath]${ANDROID_SDK_ROOT}/emulator" + fi + displayName: Check if emulator are installed and add to PATH + + - script: | + if [[ ":$PATH:" == *":${ANDROID_SDK_ROOT}/platform-tools:"* ]]; then + echo "${ANDROID_SDK_ROOT}/platform-tools is in PATH" + else + ${ANDROID_SDK_ROOT}/cmdline-tools/latest/bin/sdkmanager 
--install "platform-tools" + echo "##vso[task.prependpath]${ANDROID_SDK_ROOT}/platform-tools" + fi + ls -R ${ANDROID_SDK_ROOT}/platform-tools + displayName: Check if platform tools are installed and add to PATH + - script: | set -e -x python3 tools/python/run_android_emulator.py \ diff --git a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml index 39e68f5631f01..7d64f78c695fa 100644 --- a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml @@ -137,6 +137,25 @@ stages: WITH_CACHE: false MachinePool: 'onnxruntime-Win-CPU-2022' +# Build only. Does not run any tests. +- stage: x64_release_vitisai + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + buildArch: x64 + additionalBuildFlags: --build_wheel --use_vitisai + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_release + RunOnnxRuntimeTests: false + isTraining: false + ORT_EP_NAME: VITISAI + GenerateDocumentation: false + WITH_CACHE: false + MachinePool: 'onnxruntime-Win-CPU-2022' + - stage: x64_release_winml dependsOn: [] jobs: diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml index 1af00da01241a..70c0c7d4a04e7 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-ci-pipeline.yml @@ -55,7 +55,7 @@ jobs: WithCache: True Today: $(TODAY) AdditionalKey: "gpu-tensorrt | RelWithDebInfo" - BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86' + BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86' MsbuildArguments: $(MsbuildArguments) BuildArch: 'x64' Platform: 'x64' @@ -75,7 +75,7 @@ jobs: del wheel_filename_file python.exe -m pip install -q --upgrade %WHEEL_FILENAME% set PATH=$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo;%PATH% - python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 + python $(Build.SourcesDirectory)\tools\ci_build\build.py --config RelWithDebInfo --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "Visual Studio 17 2022" 
--build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8" --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' displayName: 'Run tests' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 index 86c178aae519b..2d3dc05285e3c 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.0.1.6-1.cuda11.8 +ARG TRT_VERSION=10.2.0.19-1.cuda11.8 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch index 4542d3a3f2e4c..a50788e98ffe0 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.0.1.6-1.cuda11.8 +ARG TRT_VERSION=10.2.0.19-1.cuda11.8 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH /opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 5ef56fd885ca7..1aca3e305452d 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -ARG TRT_VERSION=10.0.1.6-1+cuda11.8 +ARG TRT_VERSION=10.2.0.19-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg index 194a22850030c..5697120a48b2b 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -ARG TRT_VERSION=10.0.1.6-1+cuda11.8 +ARG TRT_VERSION=10.2.0.19-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 deleted file mode 100644 index 8b32425afce1c..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_6_tensorrt8_4 +++ /dev/null @@ -1,63 +0,0 @@ -# -------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. 
All rights reserved. -# Licensed under the MIT License. -# -------------------------------------------------------------- -# Dockerfile to run ONNXRuntime with TensorRT integration - -FROM nvidia/cuda:11.6.1-cudnn8-devel-ubuntu20.04 - - -# ONNX Runtime Variables -ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime -ARG ONNXRUNTIME_BRANCH=main -ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80 - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:/code/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} - -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update &&\ - apt-get install -y sudo git bash unattended-upgrades wget -RUN unattended-upgrade - -# Install python3 -RUN apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - python3-dev \ - python3-wheel &&\ - cd /usr/local/bin &&\ - ln -s /usr/bin/python3 python &&\ - ln -s /usr/bin/pip3 pip; - -RUN pip install --upgrade pip -RUN pip install setuptools>=68.2.2 - -# Install TensorRT -RUN v="8.4.1-1+cuda11.6" &&\ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ - apt-get update &&\ - sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \ - libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \ - python3-libnvinfer=${v} libnvinfer-samples=${v} - -# Compile trtexec -RUN cd /usr/src/tensorrt/samples/trtexec && make - -# Install Valgrind -RUN apt-get install -y valgrind - -ARG BUILD_USER=onnxruntimedev -ARG BUILD_UID=1000 -RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID -USER $BUILD_USER -WORKDIR /code -ENV CUDA_MODULE_LOADING "LAZY" - -# Prepare onnxruntime repository & build onnxruntime with TensorRT -RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ - /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh &&\ - cd onnxruntime &&\ - /bin/sh build.sh --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"' &&\ - pip install /code/onnxruntime/build/Linux/Release/dist/*.whl &&\ - cd .. diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 deleted file mode 100644 index cfc7023ef8e61..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_5 +++ /dev/null @@ -1,92 +0,0 @@ -# -------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. 
-# -------------------------------------------------------------- -# Dockerfile to run ONNXRuntime with TensorRT integration - -# Build base image with required system packages -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base - -# The local directory into which to build and install CMAKE -ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code - -ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${ONNXRUNTIME_LOCAL_CODE_DIR}/cmake-3.27.3-linux-x86_64/bin:/opt/miniconda/bin:${PATH} -ENV DEBIAN_FRONTEND=noninteractive - -RUN apt-get update &&\ - apt-get install -y sudo git bash unattended-upgrades wget -RUN unattended-upgrade - -# Install python3 -RUN apt-get install -y --no-install-recommends \ - python3 \ - python3-pip \ - python3-dev \ - python3-wheel &&\ - cd /usr/local/bin &&\ - ln -s /usr/bin/python3 python &&\ - ln -s /usr/bin/pip3 pip; - -RUN pip install --upgrade pip -RUN pip install setuptools>=68.2.2 - -# Install TensorRT -RUN v="8.5.1-1+cuda11.8" &&\ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ - apt-get update &&\ - sudo apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} \ - libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} \ - python3-libnvinfer=${v} libnvinfer-samples=${v} - -# Compile trtexec -RUN cd /usr/src/tensorrt/samples/trtexec && make - -# Install Valgrind -RUN apt-get install -y valgrind - -# Build final image from base. Builds ORT. -FROM base as final -ARG BUILD_USER=onnxruntimedev -ARG BUILD_UID=1000 -RUN adduser --gecos 'onnxruntime Build User' --disabled-password $BUILD_USER --uid $BUILD_UID -USER $BUILD_USER - -# ONNX Runtime arguments - -# URL to the github repo from which to clone ORT. -ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime - -# The local directory into which to clone ORT. -ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code - -# The git branch of ORT to checkout and build. -ARG ONNXRUNTIME_BRANCH=main - -# Optional. The specific commit to pull and build from. If not set, the latest commit is used. -ARG ONNXRUNTIME_COMMIT_ID - -# The supported CUDA architecture -ARG CMAKE_CUDA_ARCHITECTURES=37;50;52;60;61;70;75;80 - -WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR} - -# Clone ORT repository with branch -RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXRUNTIME_REPO} onnxruntime &&\ - /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh - -WORKDIR ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime - -# Reset to a specific commit if specified by build args. 
-RUN if [ -z "$ONNXRUNTIME_COMMIT_ID" ] ; then echo "Building branch ${ONNXRUNTIME_BRANCH}" ;\ - else echo "Building branch ${ONNXRUNTIME_BRANCH} @ commit ${ONNXRUNTIME_COMMIT_ID}" &&\ - git reset --hard ${ONNXRUNTIME_COMMIT_ID} && git submodule update --recursive ; fi - -# Build ORT -ENV CUDA_MODULE_LOADING "LAZY" -RUN /bin/sh build.sh --parallel --build_shared_lib --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --use_tensorrt --tensorrt_home /usr/lib/x86_64-linux-gnu/ --config Release --build_wheel --skip_tests --skip_submodule_sync --cmake_extra_defines '"CMAKE_CUDA_ARCHITECTURES='${CMAKE_CUDA_ARCHITECTURES}'"' - -# Switch to root to continue following steps of CI -USER root - -# Intall ORT wheel -RUN pip install ${ONNXRUNTIME_LOCAL_CODE_DIR}/onnxruntime/build/Linux/Release/dist/*.whl \ No newline at end of file diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 similarity index 99% rename from tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0 rename to tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 index cd168e1911d95..0bd56a1a5873f 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 @@ -31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install psutil setuptools>=68.2.2 # Install TensorRT -RUN version="10.0.1.6-1+cuda11.8" &&\ +RUN version="10.2.0.19-1+cuda11.8" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 similarity index 83% rename from tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0 rename to tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 index 3e48415118c63..7f66943dd8745 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_4_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -FROM nvidia/cuda:12.4.1-devel-ubuntu20.04 AS base +FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 AS base # The local directory into which to build and install CMAKE ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code @@ -30,15 +30,27 @@ RUN apt-get install -y --no-install-recommends \ RUN pip install --upgrade pip RUN pip install setuptools>=68.2.2 psutil -# Install cuDNN v9 -RUN apt-get -y install cudnn9-cuda-12 - # Install TensorRT -RUN version="10.0.1.6-1+cuda12.4" &&\ +RUN version="10.2.0.19-1+cuda12.5" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ - tensorrt=${version} + libnvinfer-dev=${version} \ + libnvinfer-dispatch-dev=${version} \ + libnvinfer-dispatch10=${version} \ + libnvinfer-headers-dev=${version} \ + libnvinfer-headers-plugin-dev=${version} \ + libnvinfer-lean-dev=${version} \ + libnvinfer-lean10=${version} \ + libnvinfer-plugin-dev=${version} \ + libnvinfer-plugin10=${version} \ + libnvinfer-vc-plugin-dev=${version} \ + libnvinfer-vc-plugin10=${version} \ + libnvinfer10=${version} \ + 
libnvonnxparsers-dev=${version} \ + libnvonnxparsers10=${version} \ + tensorrt-dev=${version} \ + libnvinfer-bin=${version} # Compile trtexec if not installed RUN if [ ! -d /usr/src/tensorrt/bin ] || [ ! -f /usr/src/tensorrt/bin/trtexec ]; then \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin index a26bf88fbbdf6..0281c1c8fef25 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT installed from provided binaries # Build base image with required system packages -FROM nvidia/cuda:12.3.1-devel-ubuntu20.04 AS base +FROM nvidia/cuda:12.5.1-cudnn-devel-ubuntu20.04 AS base # The local directory into which to build and install CMAKE ARG ONNXRUNTIME_LOCAL_CODE_DIR=/code @@ -30,9 +30,6 @@ RUN apt-get install -y --no-install-recommends \ RUN pip install --upgrade pip RUN pip install setuptools>=68.2.2 -# Install cuDNN v9 -RUN apt-get -y install cudnn9-cuda-12 - # Install TensorRT # Must provide version numbers used to build the name of the tar file containing TensorRT binaries. # See: https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html#installing-tar diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile index 3a7f410d3859e..a0020a9827290 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile @@ -5,7 +5,7 @@ ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 FROM $BASEIMAGE -ARG TRT_VERSION=10.0.1.6-1.cuda11.8 +ARG TRT_VERSION=10.2.0.19-1.cuda11.8 #Install TensorRT only if TRT_VERSION is not empty RUN if [ -n "${TRT_VERSION}" ]; then \ diff --git a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile index b94826ae0e4bc..bf21a65314985 100644 --- a/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile +++ b/tools/ci_build/github/pai/rocm-ci-pipeline-env.Dockerfile @@ -1,7 +1,7 @@ # Refer to https://github.com/RadeonOpenCompute/ROCm-docker/blob/master/dev/Dockerfile-ubuntu-22.04-complete FROM ubuntu:22.04 -ARG ROCM_VERSION=6.0 +ARG ROCM_VERSION=6.1 ARG AMDGPU_VERSION=${ROCM_VERSION} ARG APT_PREF='Package: *\nPin: release o=repo.radeon.com\nPin-Priority: 600' @@ -77,11 +77,7 @@ RUN ln -sf /usr/lib/x86_64-linux-gnu/libstdc++.so.6 ${CONDA_ENVIRONMENT_PATH}/bi RUN export MAJOR=$(cut -d '.' -f 1 <<< "$ROCM_VERSION") && \ export MINOR=$(cut -d '.' -f 2 <<< "$ROCM_VERSION") && \ export PATCH=$(cut -d '.' 
-f 3 <<< "$ROCM_VERSION") && \ - if (( MAJOR >= 6 )); then \ - pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm${MAJOR}.${MINOR} ; \ - else \ - pip install torch==2.0.1 torchvision==0.15.2 -f https://repo.radeon.com/rocm/manylinux/rocm-rel-${MAJOR}.${MINOR}/ ; \ - fi && \ + pip install torch==2.1.2 torchvision==0.16.1 -f https://repo.radeon.com/rocm/manylinux/rocm-rel-${MAJOR}.${MINOR}/ && \ pip install torch-ort --no-dependencies ##### Install Cupy to decrease CPU utilization diff --git a/tools/ci_build/github/windows/post_to_dashboard/requirements.txt b/tools/ci_build/github/windows/post_to_dashboard/requirements.txt index b8c00a610b781..6ece3c1f92c4e 100644 --- a/tools/ci_build/github/windows/post_to_dashboard/requirements.txt +++ b/tools/ci_build/github/windows/post_to_dashboard/requirements.txt @@ -1,2 +1,2 @@ -azure-kusto-data[pandas]==3.0.1 -azure-kusto-ingest[pandas]==3.0.1 +azure-kusto-data[pandas]==4.5.1 +azure-kusto-ingest[pandas]==4.5.1 diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index b753cdae16b90..6c59866ea925a 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -6,10 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( ) else ( set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64;%PATH% ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH% @REM The default version is still cuda v11.8, because set cuda v12.2 after it -set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-12.4\lib +set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-12.5\lib if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v12.2\bin;%AGENT_TEMPDIRECTORY%\v12.2\extras\CUPTI\lib64 ) else ( diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat index 4e43b5999a315..249bb98815897 100644 --- a/tools/ci_build/github/windows/setup_env_trt.bat +++ b/tools/ci_build/github/windows/setup_env_trt.bat @@ -6,6 +6,6 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( ) else ( set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.8\extras\CUPTI\lib64 ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.2.0.19.Windows10.x86_64.cuda-11.8\lib;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false set CUDA_MODULE_LOADING=LAZY \ No newline at end of file