From 780735098d8ecc90f0dc74a442e448f90c227d45 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Thu, 19 Dec 2024 10:23:27 -0800
Subject: [PATCH 1/4] [nodejs binding] Fix building in latest clang (#23146)

### Description

This change fixes the build break for the Node.js binding on the latest AppleClang:

```
...tensor_helper.cc:65:5 error: integer value -1 is outside of the valid range of values [0,15] for the enumeration type 'napi_typedarray_type' [-Wenum-constexpr-conversion]
```

Use the underlying type of the enum `napi_typedarray_type` for `DATA_TYPE_TYPEDARRAY_MAP` to solve this issue. Because the underlying type is implementation-defined (it is `int` for MSVC and `unsigned int` for Clang), we use `std::underlying_type_t` to get the correct type.

---
 js/node/src/tensor_helper.cc | 55 ++++++++++++++++++++++--------------
 1 file changed, 34 insertions(+), 21 deletions(-)

diff --git a/js/node/src/tensor_helper.cc b/js/node/src/tensor_helper.cc
index 27eb9b65c62d3..12b1a79793ff3 100644
--- a/js/node/src/tensor_helper.cc
+++ b/js/node/src/tensor_helper.cc
@@ -53,24 +53,24 @@ constexpr size_t DATA_TYPE_ELEMENT_SIZE_MAP[] = {
 static_assert(sizeof(DATA_TYPE_ELEMENT_SIZE_MAP) == sizeof(size_t) * ONNX_TENSOR_ELEMENT_DATA_TYPE_COUNT,
               "definition not matching");
 
-constexpr napi_typedarray_type DATA_TYPE_TYPEDARRAY_MAP[] = {
-    (napi_typedarray_type)(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED not supported
-    napi_float32_array,          // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT
-    napi_uint8_array,            // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8
-    napi_int8_array,             // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8
-    napi_uint16_array,           // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16
-    napi_int16_array,            // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16
-    napi_int32_array,            // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32
-    napi_bigint64_array,         // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64
-    (napi_typedarray_type)(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported
-    napi_uint8_array,            // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL
-    napi_uint16_array,           // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array
-    napi_float64_array,          // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE
-    napi_uint32_array,           // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32
-    napi_biguint64_array,        // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64
-    (napi_typedarray_type)(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported
-    (napi_typedarray_type)(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported
-    (napi_typedarray_type)(-1)   // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported
+constexpr std::underlying_type_t<napi_typedarray_type> DATA_TYPE_TYPEDARRAY_MAP[] = {
+    std::underlying_type_t<napi_typedarray_type>(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED not supported
+    napi_float32_array,                                // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT
+    napi_uint8_array,                                  // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8
+    napi_int8_array,                                   // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8
+    napi_uint16_array,                                 // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16
+    napi_int16_array,                                  // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16
+    napi_int32_array,                                  // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32
+    napi_bigint64_array,                               // ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64
+    std::underlying_type_t<napi_typedarray_type>(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING not supported
+    napi_uint8_array,                                  // ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL
+    napi_uint16_array,                                 // ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16 FLOAT16 uses Uint16Array
+    napi_float64_array,                                // ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE
+    napi_uint32_array,                                 // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32
+    napi_biguint64_array,                              // ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64
+    std::underlying_type_t<napi_typedarray_type>(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX64 not supported
+    std::underlying_type_t<napi_typedarray_type>(-1),  // ONNX_TENSOR_ELEMENT_DATA_TYPE_COMPLEX128 not supported
+    std::underlying_type_t<napi_typedarray_type>(-1)   // ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16 not supported
 };
 static_assert(sizeof(DATA_TYPE_TYPEDARRAY_MAP) == sizeof(napi_typedarray_type) * ONNX_TENSOR_ELEMENT_DATA_TYPE_COUNT,
               "definition not matching");
@@ -98,7 +98,20 @@ static_assert(sizeof(DATA_TYPE_ID_TO_NAME_MAP) == sizeof(const char*) * ONNX_TEN
               "definition not matching");
 
 const std::unordered_map<std::string, ONNXTensorElementDataType> DATA_TYPE_NAME_TO_ID_MAP = {
-    {"float32", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT}, {"uint8", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8}, {"int8", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8}, {"uint16", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16}, {"int16", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16}, {"int32", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32}, {"int64", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64}, {"string", ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING}, {"bool", ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL}, {"float16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16}, {"float64", ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE}, {"uint32", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32}, {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64}};
+    {"float32", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT},
+    {"uint8", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8},
+    {"int8", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8},
+    {"uint16", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16},
+    {"int16", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16},
+    {"int32", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32},
+    {"int64", ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64},
+    {"string", ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING},
+    {"bool", ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL},
+    {"float16", ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16},
+    {"float64", ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE},
+    {"uint32", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32},
+    {"uint64", ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64},
+};
 
 // currently only support tensor
 Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo* cpu_memory_info, OrtMemoryInfo* webgpu_memory_info) {
@@ -181,7 +194,7 @@ Ort::Value NapiValueToOrtValue(Napi::Env env, Napi::Value value, OrtMemoryInfo*
                                 "Tensor.data must be a typed array for numeric tensor.");
 
     auto tensorDataTypedArray = tensorDataValue.As<Napi::TypedArray>();
-    auto typedArrayType = tensorDataValue.As<Napi::TypedArray>().TypedArrayType();
+    std::underlying_type_t<napi_typedarray_type> typedArrayType = tensorDataValue.As<Napi::TypedArray>().TypedArrayType();
     ORT_NAPI_THROW_TYPEERROR_IF(DATA_TYPE_TYPEDARRAY_MAP[elemType] != typedArrayType, env,
                                 "Tensor.data must be a typed array (", DATA_TYPE_TYPEDARRAY_MAP[elemType], ") for ",
                                 tensorTypeString, " tensors, but got typed array (", typedArrayType, ").");
@@ -294,7 +307,7 @@ Napi::Value OrtValueToNapiValue(Napi::Env env, Ort::Value&& value) {
     }
     napi_value typedArrayData;
     napi_status status =
-        napi_create_typedarray(env, DATA_TYPE_TYPEDARRAY_MAP[elemType], size, arrayBuffer, 0, &typedArrayData);
+        napi_create_typedarray(env, (napi_typedarray_type)DATA_TYPE_TYPEDARRAY_MAP[elemType], size, arrayBuffer, 0, &typedArrayData);
     NAPI_THROW_IF_FAILED(env, status, Napi::Value);
 
     // new Tensor(type, typedArrayData, dims)

From 8680244ebc4457ac3fef7bb504d3560259766ae6 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Thu, 19 Dec 2024 10:23:48 -0800
Subject: [PATCH 2/4] Fix delay load for WebGPU EP and DML EP (#23111)

### Description

This change fixes the DLL delay load problem for the WebGPU EP and DirectML EP. See the detailed explanation below.
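At a high level, the fix makes the load go through an absolute path derived from the location of onnxruntime.dll itself. A minimal, illustrative sketch of that idea (the helper name and error handling here are hypothetical; the actual implementation is `onnxruntime/core/dll/delay_load_hook.cc` in this patch):

```cpp
#include <windows.h>
#include <string>

// Sketch only: resolve a dependency against the directory of the module that
// contains this code (onnxruntime.dll) instead of the directory of the .exe.
// Long-path handling and error reporting are omitted.
static HMODULE LoadFromModuleDirectory(HMODULE self, const wchar_t* dll_name) {
  wchar_t buf[MAX_PATH];
  if (GetModuleFileNameW(self, buf, MAX_PATH) == 0) {
    return NULL;  // fall back to the default search order
  }
  std::wstring path(buf);
  path.resize(path.rfind(L'\\') + 1);  // keep the directory, with trailing backslash
  path += dll_name;                    // e.g. ...\webgpu_dawn.dll
  return LoadLibraryExW(path.c_str(), NULL,
                        LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR);
}
```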
### Problem

When onnxruntime.dll uses delay loading for its dependencies, the dependencies are loaded using `LoadLibraryEx()`, which searches the directory of the process (.exe) instead of the directory of this library (onnxruntime.dll). This is a problem for the Node.js binding and the Python binding, because Windows will try to find the dependencies in the directory of node.exe or python.exe, which is not the directory of onnxruntime.dll.

There was a previous attempt to fix this by loading DirectML.dll during the initialization of the onnxruntime Node.js binding. This works for the DML EP, but it is not a good solution because it does not really "delay" the load.

For WebGPU, the situation became worse because webgpu_dawn.dll depends on dxil.dll and dxcompiler.dll, which are explicitly loaded at runtime in the code using `LoadLibraryA()`. This has the same DLL search problem.

### Solutions

For onnxruntime.dll loading its direct dependencies, this can be resolved by setting the [`__pfnDliNotifyHook2` hook](https://learn.microsoft.com/en-us/cpp/build/reference/understanding-the-helper-function?view=msvc-170#structure-and-constant-definitions) to load from an absolute path constructed from the onnxruntime.dll folder and the DLL name.

For webgpu_dawn.dll loading dxil.dll and dxcompiler.dll, the hook does not work because they are explicitly loaded in the code. Instead, this can be resolved by ~~using the Win32 API `SetDllDirectory()` to add the onnxruntime.dll folder to the search path~~ preloading the two DLLs from the onnxruntime.dll folder.

---
 cmake/onnxruntime.cmake                       |   1 +
 cmake/onnxruntime_nodejs.cmake                |  20 ++-
 cmake/onnxruntime_providers_webgpu.cmake      |  36 +++--
 cmake/onnxruntime_unittests.cmake             |  12 ++
 js/node/CMakeLists.txt                        |  10 +-
 js/node/script/build.ts                       |   5 +
 js/node/src/directml_load_helper.cc           |  37 -----
 js/node/src/directml_load_helper.h            |   6 -
 js/node/src/inference_session_wrap.cc         |   4 -
 onnxruntime/core/dll/delay_load_hook.cc       |  83 ++++++++++
 onnxruntime/core/dll/dllmain.cc               |   2 +-
 .../core/providers/webgpu/webgpu_context.cc   |  26 ++++
 .../core/providers/webgpu/webgpu_context.h    |   3 +
 onnxruntime/test/webgpu/delay_load/main.cc    | 142 ++++++++++++++++++
 onnxruntime/test/webgpu/external_dawn/main.cc |   1 -
 .../win-gpu-webgpu-ci-pipeline.yml            |   2 +-
 16 files changed, 324 insertions(+), 66 deletions(-)
 delete mode 100644 js/node/src/directml_load_helper.cc
 delete mode 100644 js/node/src/directml_load_helper.h
 create mode 100644 onnxruntime/core/dll/delay_load_hook.cc
 create mode 100644 onnxruntime/test/webgpu/delay_load/main.cc

diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 732c0511d400f..d72b61a0859b2 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -77,6 +77,7 @@ if(WIN32)
   onnxruntime_add_shared_library(onnxruntime
     ${SYMBOL_FILE}
     "${ONNXRUNTIME_ROOT}/core/dll/dllmain.cc"
+    "${ONNXRUNTIME_ROOT}/core/dll/delay_load_hook.cc"
     "${ONNXRUNTIME_ROOT}/core/dll/onnxruntime.rc"
   )
 elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
diff --git a/cmake/onnxruntime_nodejs.cmake b/cmake/onnxruntime_nodejs.cmake
index 376d895be34a9..355575be3bcf7 100644
--- a/cmake/onnxruntime_nodejs.cmake
+++ b/cmake/onnxruntime_nodejs.cmake
@@ -60,15 +60,26 @@ else()
   endif()
 endif()
 
+# a list of DLLs that the Node.js binding depends on
+set(NODEJS_DLL_DEPS)
+
 # setup providers
 if (onnxruntime_USE_CUDA)
   set(NODEJS_BINDING_USE_CUDA "--use_cuda")
 endif()
 if (onnxruntime_USE_DML)
   set(NODEJS_BINDING_USE_DML "--use_dml")
+  list(APPEND NODEJS_DLL_DEPS "$/DirectML.dll")
 endif()
 if (onnxruntime_USE_WEBGPU)
   set(NODEJS_BINDING_USE_WEBGPU
"--use_webgpu") + if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12) + list(APPEND NODEJS_DLL_DEPS "$/dxil.dll") + list(APPEND NODEJS_DLL_DEPS "$/dxcompiler.dll") + endif() + if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) + list(APPEND NODEJS_DLL_DEPS "$") + endif() endif() if (onnxruntime_USE_TENSORRT) set(NODEJS_BINDING_USE_TENSORRT "--use_tensorrt") @@ -94,9 +105,12 @@ add_custom_target(js_common_npm_ci ALL add_custom_target(nodejs_binding_wrapper ALL COMMAND ${NPM_CLI} ci - COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR} - --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} ${NODEJS_BINDING_USE_TENSORRT} - ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN} + COMMAND ${NPM_CLI} run build -- "--onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR}" + --config=${CMAKE_BUILD_TYPE} + "--onnxruntime-generator=${CMAKE_GENERATOR}" + "--dll_deps=${NODEJS_DLL_DEPS}" + --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} + ${NODEJS_BINDING_USE_TENSORRT} ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN} WORKING_DIRECTORY ${JS_NODE_ROOT} COMMENT "Using cmake-js to build OnnxRuntime Node.js binding") diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake index fea5964f0dda9..e527d538d8757 100644 --- a/cmake/onnxruntime_providers_webgpu.cmake +++ b/cmake/onnxruntime_providers_webgpu.cmake @@ -23,19 +23,18 @@ onnxruntime_add_include_to_target(onnxruntime_providers_webgpu onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface) + set(onnxruntime_providers_webgpu_dll_deps) + if (onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY) target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn) - if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS) - list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll") - endif() + if (WIN32) + if (onnxruntime_ENABLE_DELAY_LOADING_WIN_DLLS) + list(APPEND onnxruntime_DELAYLOAD_FLAGS "/DELAYLOAD:webgpu_dawn.dll") + endif() - # Copy webgpu_dawn.dll to the output directory - add_custom_command( - TARGET onnxruntime_providers_webgpu - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different "$" "$" - VERBATIM ) + list(APPEND onnxruntime_providers_webgpu_dll_deps "$") + endif() else() if (NOT onnxruntime_USE_EXTERNAL_DAWN) target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native) @@ -43,4 +42,23 @@ target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_proc) endif() + if (WIN32 AND onnxruntime_ENABLE_DAWN_BACKEND_D3D12) + # Ensure dxil.dll and dxcompiler.dll exist in the output directory $ + add_dependencies(onnxruntime_providers_webgpu copy_dxil_dll) + add_dependencies(onnxruntime_providers_webgpu dxcompiler) + + list(APPEND onnxruntime_providers_webgpu_dll_deps "$/dxil.dll") + list(APPEND onnxruntime_providers_webgpu_dll_deps "$/dxcompiler.dll") + endif() + + if (onnxruntime_providers_webgpu_dll_deps) + # Copy dependency DLLs to the output directory + add_custom_command( + TARGET onnxruntime_providers_webgpu + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different "${onnxruntime_providers_webgpu_dll_deps}" "$" + COMMAND_EXPAND_LISTS + VERBATIM ) + endif() + set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime") diff --git a/cmake/onnxruntime_unittests.cmake 
b/cmake/onnxruntime_unittests.cmake index e822f0a3655fc..9e3ab4d41f416 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -525,6 +525,9 @@ set (onnxruntime_global_thread_pools_test_SRC set (onnxruntime_webgpu_external_dawn_test_SRC ${TEST_SRC_DIR}/webgpu/external_dawn/main.cc) +set (onnxruntime_webgpu_delay_load_test_SRC + ${TEST_SRC_DIR}/webgpu/delay_load/main.cc) + # tests from lowest level library up. # the order of libraries should be maintained, with higher libraries being added first in the list @@ -1864,4 +1867,13 @@ if (onnxruntime_USE_WEBGPU AND onnxruntime_USE_EXTERNAL_DAWN) onnxruntime_add_include_to_target(onnxruntime_webgpu_external_dawn_test dawn::dawncpp_headers dawn::dawn_headers) endif() +if (onnxruntime_USE_WEBGPU AND WIN32 AND onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND NOT onnxruntime_MINIMAL_BUILD) + AddTest(DYN + TARGET onnxruntime_webgpu_delay_load_test + SOURCES ${onnxruntime_webgpu_delay_load_test_SRC} + LIBS ${SYS_PATH_LIB} + DEPENDS ${all_dependencies} + ) +endif() + include(onnxruntime_fuzz_test.cmake) diff --git a/js/node/CMakeLists.txt b/js/node/CMakeLists.txt index d79a82c572dc2..c78b40a3e7429 100644 --- a/js/node/CMakeLists.txt +++ b/js/node/CMakeLists.txt @@ -113,10 +113,12 @@ endif() if (WIN32) file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/onnxruntime.dll DESTINATION ${dist_folder}) - if (USE_DML) - file(COPY ${ONNXRUNTIME_WIN_BIN_DIR}/DirectML.dll - DESTINATION ${dist_folder}) - endif () + if (ORT_NODEJS_DLL_DEPS) + foreach(dll ${ORT_NODEJS_DLL_DEPS}) + file(COPY ${dll} DESTINATION ${dist_folder}) + endforeach() + endif() + elseif (APPLE) file(COPY ${ONNXRUNTIME_BUILD_DIR}/libonnxruntime.dylib DESTINATION ${dist_folder} FOLLOW_SYMLINK_CHAIN) diff --git a/js/node/script/build.ts b/js/node/script/build.ts index dcdcb93377b4c..b557368ed58c6 100644 --- a/js/node/script/build.ts +++ b/js/node/script/build.ts @@ -39,6 +39,8 @@ const USE_TENSORRT = !!buildArgs.use_tensorrt; const USE_COREML = !!buildArgs.use_coreml; // --use_qnn const USE_QNN = !!buildArgs.use_qnn; +// --dll_deps= +const DLL_DEPS = buildArgs.dll_deps; // build path const ROOT_FOLDER = path.join(__dirname, '..'); @@ -82,6 +84,9 @@ if (USE_COREML) { if (USE_QNN) { args.push('--CDUSE_QNN=ON'); } +if (DLL_DEPS) { + args.push(`--CDORT_NODEJS_DLL_DEPS=${DLL_DEPS}`); +} // set CMAKE_OSX_ARCHITECTURES for macOS build if (os.platform() === 'darwin') { diff --git a/js/node/src/directml_load_helper.cc b/js/node/src/directml_load_helper.cc deleted file mode 100644 index 6aafe4d5fa788..0000000000000 --- a/js/node/src/directml_load_helper.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. 
-
-#ifdef _WIN32
-#include "common.h"
-#include "windows.h"
-
-void LoadDirectMLDll(Napi::Env env) {
-  DWORD pathLen = MAX_PATH;
-  std::wstring path(pathLen, L'\0');
-  HMODULE moduleHandle = nullptr;
-
-  GetModuleHandleEx(GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT,
-                    reinterpret_cast<LPCTSTR>(&LoadDirectMLDll), &moduleHandle);
-
-  DWORD getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast<wchar_t*>(path.c_str()), pathLen);
-  while (getModuleFileNameResult == 0 || getModuleFileNameResult == pathLen) {
-    int ret = GetLastError();
-    if (ret == ERROR_INSUFFICIENT_BUFFER && pathLen < 32768) {
-      pathLen *= 2;
-      path.resize(pathLen);
-      getModuleFileNameResult = GetModuleFileNameW(moduleHandle, const_cast<wchar_t*>(path.c_str()), pathLen);
-    } else {
-      ORT_NAPI_THROW_ERROR(env, "Failed getting path to load DirectML.dll, error code: ", ret);
-    }
-  }
-
-  path.resize(path.rfind(L'\\') + 1);
-  path.append(L"DirectML.dll");
-  HMODULE libraryLoadResult = LoadLibraryW(path.c_str());
-
-  if (!libraryLoadResult) {
-    int ret = GetLastError();
-    ORT_NAPI_THROW_ERROR(env, "Failed loading bundled DirectML.dll, error code: ", ret);
-  }
-}
-#endif
diff --git a/js/node/src/directml_load_helper.h b/js/node/src/directml_load_helper.h
deleted file mode 100644
index 074a4f95ed476..0000000000000
--- a/js/node/src/directml_load_helper.h
+++ /dev/null
@@ -1,6 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#if defined(USE_DML) && defined(_WIN32)
-void LoadDirectMLDll(Napi::Env env);
-#endif
diff --git a/js/node/src/inference_session_wrap.cc b/js/node/src/inference_session_wrap.cc
index 23d859351f426..04ab71dc48ec2 100644
--- a/js/node/src/inference_session_wrap.cc
+++ b/js/node/src/inference_session_wrap.cc
@@ -4,7 +4,6 @@
 #include "onnxruntime_cxx_api.h"
 
 #include "common.h"
-#include "directml_load_helper.h"
 #include "inference_session_wrap.h"
 #include "run_options_helper.h"
 #include "session_options_helper.h"
@@ -19,9 +18,6 @@ Napi::FunctionReference& InferenceSessionWrap::GetTensorConstructor() {
 }
 
 Napi::Object InferenceSessionWrap::Init(Napi::Env env, Napi::Object exports) {
-#if defined(USE_DML) && defined(_WIN32)
-  LoadDirectMLDll(env);
-#endif
   // create ONNX runtime env
   Ort::InitApi();
   ORT_NAPI_THROW_ERROR_IF(
diff --git a/onnxruntime/core/dll/delay_load_hook.cc b/onnxruntime/core/dll/delay_load_hook.cc
new file mode 100644
index 0000000000000..23fc8bca7368e
--- /dev/null
+++ b/onnxruntime/core/dll/delay_load_hook.cc
@@ -0,0 +1,83 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// == workaround for delay loading of dependencies of onnxruntime.dll ==
+//
+// Problem:
+//
+// When onnxruntime.dll uses delay loading for its dependencies, the dependencies are loaded using LoadLibraryEx,
+// which searches the directory of the process (.exe) instead of this library (onnxruntime.dll). This is a problem
+// for the Node.js binding and the Python binding, because Windows will try to find the dependencies in the
+// directory of node.exe or python.exe, which is not the directory of onnxruntime.dll.
+//
+// Solution:
+//
+// By using the delay load hook `__pfnDliNotifyHook2`, we can intervene in the loading procedure by loading from an
+// absolute path. The absolute path is constructed by appending the name of the DLL to load to the directory of
+// onnxruntime.dll. This way, we can ensure that the dependencies are loaded from the same directory as onnxruntime.dll.
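+//
+// For example (paths illustrative): if onnxruntime.dll was loaded from
+// C:\app\node_modules\onnxruntime-node\bin\, a delayed load of webgpu_dawn.dll resolves to
+// C:\app\node_modules\onnxruntime-node\bin\webgpu_dawn.dll rather than being searched for
+// starting from the directory of node.exe.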
+//
+// See also:
+// - https://learn.microsoft.com/en-us/cpp/build/reference/understanding-the-helper-function?view=msvc-170#structure-and-constant-definitions
+// - https://learn.microsoft.com/en-us/windows/win32/dlls/dynamic-link-library-search-order#alternate-search-order-for-unpackaged-apps
+//
+// The DLL DelayLoad hook is only enabled when the compiler is MSVC and at least one of the following is true:
+// - both USE_WEBGPU and BUILD_DAWN_MONOLITHIC_LIBRARY are defined
+// - USE_DML is defined
+//
+#define ORT_DELAY_LOAD_WEBGPU_DAWN_DLL (defined(USE_WEBGPU) && defined(BUILD_DAWN_MONOLITHIC_LIBRARY))
+#define ORT_DELAY_LOAD_DIRECTML_DLL defined(USE_DML)
+#if defined(_MSC_VER) && (ORT_DELAY_LOAD_WEBGPU_DAWN_DLL || ORT_DELAY_LOAD_DIRECTML_DLL)
+
+#include <Windows.h>
+#include <delayimp.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "core/platform/env.h"
+
+namespace {
+
+#define DEFINE_KNOWN_DLL(name) {#name ".dll", L#name L".dll"}
+
+constexpr struct {
+  const char* str;
+  const wchar_t* wstr;
+} known_dlls[] = {
+#if ORT_DELAY_LOAD_WEBGPU_DAWN_DLL
+    DEFINE_KNOWN_DLL(webgpu_dawn),
+#endif
+#if ORT_DELAY_LOAD_DIRECTML_DLL
+    DEFINE_KNOWN_DLL(DirectML),
+#endif
+};
+}  // namespace
+
+FARPROC WINAPI delay_load_hook(unsigned dliNotify, PDelayLoadInfo pdli) {
+  if (dliNotify == dliNotePreLoadLibrary) {
+    for (size_t i = 0; i < _countof(known_dlls); ++i) {
+      if (_stricmp(pdli->szDll, known_dlls[i].str) == 0) {
+        // Try to load the DLL from the same directory as onnxruntime.dll
+
+        // First, get the path to onnxruntime.dll
+        auto path = Env::Default().GetRuntimePath();
+        if (path.empty()) {
+          // Failed to get the path to onnxruntime.dll. In this case, we will just return NULL and let the system
+          // search for the DLL in the default search order.
+          return NULL;
+        }
+
+        // Append the name of the DLL. Now `path` is the absolute path to the DLL to load.
+        path.append(known_dlls[i].wstr);
+
+        // Load the DLL
+        return FARPROC(LoadLibraryExW(path.c_str(), NULL,
+                                      LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR));
+      }
+    }
+  }
+  return NULL;
+}
+
+extern "C" const PfnDliHook __pfnDliNotifyHook2 = delay_load_hook;
+
+#endif
diff --git a/onnxruntime/core/dll/dllmain.cc b/onnxruntime/core/dll/dllmain.cc
index 2e7bdafd0599f..ac5dcd9c96084 100644
--- a/onnxruntime/core/dll/dllmain.cc
+++ b/onnxruntime/core/dll/dllmain.cc
@@ -13,7 +13,7 @@
 #pragma GCC diagnostic pop
 #endif
 
-// dllmain.cpp : Defines the entry point for the DLL application.
+// dllmain.cc : Defines the entry point for the DLL application.
 BOOL APIENTRY DllMain(HMODULE /*hModule*/,
                       DWORD ul_reason_for_call,
                       LPVOID /*lpReserved*/
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc
index d66c2a79d28a8..c85a15017659c 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.cc
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc
@@ -10,6 +10,8 @@
 #endif
 
 #include "core/common/common.h"
+#include "core/common/path_string.h"
+#include "core/platform/env.h"
 
 #include "core/providers/webgpu/compute_context.h"
 #include "core/providers/webgpu/webgpu_context.h"
@@ -50,6 +52,30 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info
 
   // Initialization.Step.2 - Create wgpu::Adapter
   if (adapter_ == nullptr) {
+#if !defined(__EMSCRIPTEN__) && defined(_MSC_VER) && defined(DAWN_ENABLE_D3D12) && !defined(USE_EXTERNAL_DAWN)
+    // If we are using the D3D12 backend on Windows and the build does not use external Dawn, dxil.dll and dxcompiler.dll are required.
+    //
+    // Dawn will try to load them later, but if they are in a different directory from the executable, it may fail to find them.
+    // To avoid this issue, we try to load them from the same directory as the current module (usually onnxruntime.dll).
+    auto runtime_path = Env::Default().GetRuntimePath();
+    if (!runtime_path.empty()) {
+      Status status;
+      void* module_handle = nullptr;
+
+      PathString dxil_path = runtime_path + ToPathString(L"dxil.dll");
+      status = Env::Default().LoadDynamicLibrary(dxil_path, false, &module_handle);
+      if (status.IsOK() && module_handle != nullptr) {
+        modules_.Add(dxil_path, module_handle);
+      }
+
+      PathString dxcompiler_path = runtime_path + ToPathString(L"dxcompiler.dll");
+      status = Env::Default().LoadDynamicLibrary(dxcompiler_path, false, &module_handle);
+      if (status.IsOK() && module_handle != nullptr) {
+        modules_.Add(dxcompiler_path, module_handle);
+      }
+    }
+#endif
+
     wgpu::RequestAdapterOptions req_adapter_options = {};
     wgpu::DawnTogglesDescriptor adapter_toggles_desc = {};
     req_adapter_options.nextInChain = &adapter_toggles_desc;
diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.h b/onnxruntime/core/providers/webgpu/webgpu_context.h
index be05b06523b9c..c41ef3e211264 100644
--- a/onnxruntime/core/providers/webgpu/webgpu_context.h
+++ b/onnxruntime/core/providers/webgpu/webgpu_context.h
@@ -13,6 +13,7 @@
 #include
 
 #include "core/common/common.h"
+#include "core/framework/library_handles.h"
 #include "core/providers/webgpu/webgpu_execution_provider.h"
 #include "core/providers/webgpu/buffer_manager.h"
 #include "core/providers/webgpu/program_manager.h"
@@ -153,6 +154,8 @@ class WebGpuContext final {
 
   std::once_flag init_flag_;
 
+  LibraryHandles modules_;
+
   wgpu::Instance instance_;
   wgpu::Adapter adapter_;
   wgpu::Device device_;
diff --git a/onnxruntime/test/webgpu/delay_load/main.cc b/onnxruntime/test/webgpu/delay_load/main.cc
new file mode 100644
index 0000000000000..f909b4a6916b4
--- /dev/null
+++ b/onnxruntime/test/webgpu/delay_load/main.cc
@@ -0,0 +1,142 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include
+#include
+#include
+#include
+#define ORT_API_MANUAL_INIT
+#include "core/session/onnxruntime_cxx_api.h"
+
+// This program is to test the delay loading of onnxruntime.dll.
+//
+// To verify the delay loading actually works, we need to do the test in 2 steps:
+//
+// 1. Prepare a folder structure like below:
+//
+//    ├── webgpu_delay_load_test_root (newly created folder)
+//    │   ├── dlls
+//    │   │   ├── onnxruntime.dll
+//    │   │   ├── webgpu_dawn.dll
+//    │   │   ├── dxil.dll
+//    │   │   └── dxcompiler.dll
+//    │   └── test.exe
+//    └── onnxruntime_webgpu_delay_load_test.exe (this binary)
+//
+//    This folder structure ensures no DLLs are in the same folder as the executable (test.exe).
+//
+// 2. Launch the test binary from the root folder of the above structure.
+//
+// So, there are 2 modes of this program:
+// 1. "Prepare" mode: Do the step 1 above. (default)
+// 2. "Test" mode: Do the step 2 above. (specified by --test argument)
+
+int prepare_main();
+int test_main();
+
+int wmain(int argc, wchar_t* argv[]) {
+  if (argc == 2 && wcscmp(argv[1], L"--test") == 0) {
+    return test_main();
+  } else {
+    return prepare_main();
+  }
+}
+
+int prepare_main() {
+  std::wstring path_str(32768, L'\0');
+  GetModuleFileNameW(NULL, path_str.data(), static_cast<DWORD>(path_str.size()));
+
+  namespace fs = std::filesystem;
+  fs::path exe_full_path{path_str};                                    // /onnxruntime_webgpu_delay_load_test.exe
+  fs::path test_dir = exe_full_path.parent_path();                     // /
+  fs::path exe_name = exe_full_path.filename();                        // onnxruntime_webgpu_delay_load_test.exe
+  fs::path root_folder = test_dir / L"webgpu_delay_load_test_root\\";  // /webgpu_delay_load_test_root/
+  fs::path dlls_folder = root_folder / L"dlls\\";                      // /webgpu_delay_load_test_root/dlls/
+
+  // ensure the test folder exists and is empty
+  if (fs::exists(root_folder)) {
+    fs::remove_all(root_folder);
+  }
+  fs::create_directories(dlls_folder);
+
+  fs::current_path(test_dir);
+
+  // copy the required DLLs to the dlls folder
+  fs::copy_file(L"onnxruntime.dll", dlls_folder / L"onnxruntime.dll");
+  fs::copy_file(L"dxil.dll", dlls_folder / L"dxil.dll");
+  fs::copy_file(L"dxcompiler.dll", dlls_folder / L"dxcompiler.dll");
+  if (fs::exists(L"webgpu_dawn.dll")) {
+    fs::copy_file(L"webgpu_dawn.dll", dlls_folder / L"webgpu_dawn.dll");
+  }
+
+  // copy the test binary to the root folder
+  fs::copy_file(exe_full_path, root_folder / L"test.exe");
+
+  // run "test.exe --test" from the test root folder
+  fs::current_path(root_folder);
+  return _wsystem(L"test.exe --test");
+}
+
+int run() {
+  Ort::Env env{nullptr};
+  int retval = 0;
+  try {
+    env = Ort::Env{ORT_LOGGING_LEVEL_WARNING, "Default"};
+
+    // model is https://github.com/onnx/onnx/blob/v1.15.0/onnx/backend/test/data/node/test_abs/model.onnx
+    constexpr uint8_t MODEL_DATA[] = {8, 7, 18, 12, 98, 97, 99, 107, 101, 110,
+                                      100, 45, 116, 101, 115, 116, 58, 73, 10, 11,
+                                      10, 1, 120, 18, 1, 121, 34, 3, 65, 98,
+                                      115, 18, 8, 116, 101, 115, 116, 95, 97, 98,
+                                      115, 90, 23, 10, 1, 120, 18, 18, 10, 16,
+                                      8, 1, 18, 12, 10, 2, 8, 3, 10, 2,
+                                      8, 4, 10, 2, 8, 5, 98, 23, 10, 1,
+                                      121, 18, 18, 10, 16, 8, 1, 18, 12, 10,
+                                      2, 8, 3, 10, 2, 8, 4, 10, 2, 8,
+                                      5, 66, 4, 10, 0, 16, 13};
+
+    Ort::SessionOptions session_options;
+    session_options.DisableMemPattern();
+    std::unordered_map<std::string, std::string> provider_options;
+    session_options.AppendExecutionProvider("WebGPU", provider_options);
+    Ort::Session session{env, MODEL_DATA, sizeof(MODEL_DATA), session_options};
+
+    // successfully initialized
+    std::cout << "Successfully initialized WebGPU EP." << std::endl;
+    retval = 0;
+  } catch (const std::exception& ex) {
+    std::cerr << ex.what() << std::endl;
+
+    std::cerr << "Unexpected exception." << std::endl;
+    retval = -1;
+  }
+
+  return retval;
+}
+
+int test_main() {
+  HMODULE hModule = LoadLibraryA("dlls\\onnxruntime.dll");
+  if (hModule == NULL) {
+    std::cout << "Failed to load dlls\\onnxruntime.dll" << std::endl;
+    return 1;
+  }
+
+  int retval = 0;
+
+  using OrtGetApiBaseFunction = decltype(&OrtGetApiBase);
+  auto fnOrtGetApiBase = (OrtGetApiBaseFunction)GetProcAddress(hModule, "OrtGetApiBase");
+  if (fnOrtGetApiBase == NULL) {
+    std::cout << "Failed to get OrtGetApiBase" << std::endl;
+    retval = 1;
+    goto cleanup;
+  }
+  Ort::InitApi(fnOrtGetApiBase()->GetApi(ORT_API_VERSION));
+
+  retval = run();
+
+cleanup:
+  if (hModule != NULL) {
+    FreeLibrary(hModule);
+  }
+  return retval;
+}
diff --git a/onnxruntime/test/webgpu/external_dawn/main.cc b/onnxruntime/test/webgpu/external_dawn/main.cc
index ed8d2eab94ce9..1cb22b131d76b 100644
--- a/onnxruntime/test/webgpu/external_dawn/main.cc
+++ b/onnxruntime/test/webgpu/external_dawn/main.cc
@@ -1,5 +1,4 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
-// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
 // Licensed under the MIT License.
 
 #include
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
index 06f374afca57a..8460df2ec3799 100644
--- a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
@@ -48,7 +48,7 @@ stages:
           --enable_pybind
           --build_nodejs
           --use_webgpu
-          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
+          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON onnxruntime_BUILD_DAWN_MONOLITHIC_LIBRARY=ON
       msbuildPlatform: x64
       isX86: false
       job_name_suffix: x64_RelWithDebInfo

From a3bb3f148768cea41b59a2860a57264e85398dc7 Mon Sep 17 00:00:00 2001
From: Yifan Li <109183385+yf711@users.noreply.github.com>
Date: Thu, 19 Dec 2024 10:30:39 -0800
Subject: [PATCH 3/4] [TensorRT EP] New CIs to test TRT+minimal CUDA build (#23028)

### Description

New CIs: [Linux_TRT_Minimal_CUDA_Test_CI](https://dev.azure.com/onnxruntime/onnxruntime/_build?definitionId=230&_a=summary) and [Win_TRT_Minimal_CUDA_Test_CI](https://dev.azure.com/onnxruntime/onnxruntime/_build?definitionId=231)

These new CIs are configured to monitor that ORT with the TRT EP keeps building without issues against a minimal CUDA EP:
* The YAML content follows the Linux TRT CI YAML, with a different build arg/cache name.
* The build arg follows [[TensorRT EP] Enable a minimal CUDA EP compilation without kernels](https://github.com/microsoft/onnxruntime/pull/19052#issuecomment-1888066851).

### Motivation and Context

Monitor whether users are able to build ORT-TRTEP with minimal CUDA without any blocker (the build takes ~30 minutes).

---
 tools/ci_build/build.py | 2 +
 ...-gpu-tensorrt-cuda-minimal-ci-pipeline.yml | 108 ++++++++++++++++++
 ...-gpu-tensorrt-cuda-minimal-ci-pipeline.yml | 86 ++++++++++++++
 .../github/linux/build_tensorrt_ci.sh | 13 +++
 4 files changed, 209 insertions(+)
 create mode 100644 tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml
 create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml

diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 3527a89ca7a7b..53dcdc6e0c6fa 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -260,6 +260,7 @@ def convert_arg_line_to_args(self, arg_line):
     )
 
     parser.add_argument("--disable_cuda_nhwc_ops", action="store_true", help="Disable CUDA
NHWC ops in build.") + parser.add_argument("--enable_cuda_minimal_build", action="store_true", help="Enable CUDA minimal build.") # Python bindings parser.add_argument("--enable_pybind", action="store_true", help="Enable Python Bindings.") @@ -1093,6 +1094,7 @@ def generate_build_tree( "-Donnxruntime_DISABLE_FLOAT8_TYPES=" + ("ON" if disable_float8_types else "OFF"), "-Donnxruntime_DISABLE_SPARSE_TENSORS=" + ("ON" if disable_sparse_tensors else "OFF"), "-Donnxruntime_DISABLE_OPTIONAL_TYPE=" + ("ON" if disable_optional_type else "OFF"), + "-Donnxruntime_CUDA_MINIMAL=" + ("ON" if args.enable_cuda_minimal_build else "OFF"), ] if args.rv64: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml new file mode 100644 index 0000000000000..2a32dd1a62408 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -0,0 +1,108 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +### please do rerun set-trigger-rules.py ### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### +parameters: + - name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +variables: + - template: templates/common-variables.yml + - name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241120.3 + - name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: ${{ variables.linux_trt_version_cuda11 }} + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: ${{ variables.linux_trt_version_cuda12 }} + +jobs: +- job: Linux_Build + timeoutInMinutes: 180 + variables: + skipComponentGovernanceDetection: true + ALLOW_RELEASED_ONNX_OPSET_ONLY: '1' + ORT_CACHE_DIR: '$(Agent.TempDirectory)/ort/ccache' + TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + workspace: + clean: all + pool: onnxruntime-tensorrt-linuxbuild-T4 + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: none + + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=${{ variables.docker_base_image }} + --build-arg TRT_VERSION=${{ variables.linux_trt_version }} + --build-arg BUILD_UID=$( id -u ) + " + Repository: onnxruntimetensorrtcudaminimalbuild + + - template: templates/linux-build-step-with-cache.yml + parameters: + WithCache: true + Today: $(TODAY) + AdditionalKey: gpu_tensorrt_cuda_minimal + CacheDir: '$(ORT_CACHE_DIR)' + BuildStep: + - task: CmdLine@2 + inputs: + script: | + docker run --gpus all --rm \ + --volume 
/data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + --volume $(ORT_CACHE_DIR):/cache \ + -e ALLOW_RELEASED_ONNX_OPSET_ONLY=0 \ + -e NIGHTLY_BUILD \ + -e BUILD_BUILDNUMBER \ + -e CCACHE_DIR=/cache -w /onnxruntime_src \ + onnxruntimetensorrtcudaminimalbuild tools/ci_build/github/linux/build_tensorrt_ci.sh --cuda_minimal=ON + workingDirectory: $(Build.SourcesDirectory) + + - template: templates/explicitly-defined-final-tasks.yml diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml new file mode 100644 index 0000000000000..c68ba01485db2 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-tensorrt-cuda-minimal-ci-pipeline.yml @@ -0,0 +1,86 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +### please do rerun set-trigger-rules.py ### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### +parameters: +- name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +variables: + - template: templates/common-variables.yml + - name: win_trt_folder + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: ${{ variables.win_trt_folder_cuda11 }} + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: ${{ variables.win_trt_folder_cuda12 }} + +jobs: +- job: 'build' + pool: 'onnxruntime-Win2022-GPU-A10' + variables: + MsbuildArguments: '-detailedsummary -maxcpucount -consoleloggerparameters:PerformanceSummary' + EnvSetupScript: setup_env_trt.bat + skipComponentGovernanceDetection: true + TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] + timeoutInMinutes: 150 + workspace: + clean: all + steps: + - template: templates/jobs/win-ci-prebuild-steps.yml + parameters: + EnvSetupScript: $(EnvSetupScript) + DownloadCUDA: true + DownloadTRT: true + BuildArch: 'x64' + BuildConfig: RelWithDebInfo + MachinePool: 'onnxruntime-Win2022-GPU-A10' + WithCache: true + Today: $(Today) + + - template: templates/jobs/win-ci-build-steps.yml + parameters: + WithCache: True + Today: $(TODAY) + AdditionalKey: "gpu_tensorrt_cuda_minimal | RelWithDebInfo" + BuildPyArguments: '--config RelWithDebInfo --parallel --use_binskim_compliant_compile_flags --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --update --cmake_generator "Visual Studio 17 2022" --build_wheel --enable_onnx_tests --use_tensorrt --tensorrt_home="$(Agent.TempDirectory)\${{ variables.win_trt_folder }}" --cuda_home="$(Agent.TempDirectory)\v${{ parameters.CudaVersion }}" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --enable_cuda_minimal_build' + MsbuildArguments: $(MsbuildArguments) + BuildArch: 'x64' + Platform: 'x64' + BuildConfig: RelWithDebInfo + + - task: PythonScript@0 + displayName: 'Build wheel' + inputs: + scriptPath: '$(Build.SourcesDirectory)\setup.py' + arguments: 'bdist_wheel' + workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' diff --git 
a/tools/ci_build/github/linux/build_tensorrt_ci.sh b/tools/ci_build/github/linux/build_tensorrt_ci.sh index 5b206bc0a92d9..ccf7a6f4ea630 100755 --- a/tools/ci_build/github/linux/build_tensorrt_ci.sh +++ b/tools/ci_build/github/linux/build_tensorrt_ci.sh @@ -21,6 +21,19 @@ BUILD_ARGS=('--config' 'Release' "CMAKE_CUDA_ARCHITECTURES=75" "onnxruntime_BUILD_UNIT_TESTS=ON" "onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON") + +# Parse external args +for arg in "$@"; do + case $arg in + --cuda_minimal=ON) + # Replace onnxruntime_BUILD_UNIT_TESTS=ON with OFF + BUILD_ARGS=("${BUILD_ARGS[@]/onnxruntime_BUILD_UNIT_TESTS=ON/onnxruntime_BUILD_UNIT_TESTS=OFF}") + BUILD_ARGS+=("--enable_cuda_minimal_build") + BUILD_ARGS+=("--skip_tests") + ;; + esac +done + if [ -x "$(command -v ninja)" ]; then BUILD_ARGS+=('--cmake_generator' 'Ninja') fi From d9d07ad8ae5c5fded75b307b2bd83ed3f44dd186 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Thu, 19 Dec 2024 10:39:15 -0800 Subject: [PATCH 4/4] [TensorRT EP] support TensorRT 10.7-GA (#23011) ### Description Update CIs to TRT10.7 ### Motivation and Context --- cgmanifests/generated/cgmanifest.json | 2 +- cmake/deps.txt | 4 ++-- .../python/tools/tensorrt/perf/build/build_image.py | 8 ++++---- .../linux-gpu-tensorrt-daily-perf-pipeline.yml | 12 ++++++------ .../py-cuda-alt-package-test-pipeline.yml | 2 +- .../azure-pipelines/templates/common-variables.yml | 2 +- .../azure-pipelines/templates/download-deps.yml | 4 ++-- .../templates/jobs/download_win_gpu_library.yml | 6 +++--- .../azure-pipelines/templates/jobs/set-winenv.yml | 4 ++-- .../docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 | 2 +- .../Dockerfile.package_ubi8_cuda_tensorrt10_0_torch | 2 +- .../linux/docker/Dockerfile.package_ubuntu_2004_gpu | 2 +- .../docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg | 2 +- .../docker/Dockerfile.package_ubuntu_2204_gpu_opencv | 2 +- .../linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 | 2 +- .../linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 | 2 +- .../docker/inference/x86_64/python/cuda/Dockerfile | 2 +- tools/ci_build/github/windows/setup_env_gpu.bat | 4 ++-- tools/ci_build/github/windows/setup_env_trt.bat | 2 +- 19 files changed, 33 insertions(+), 33 deletions(-) diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 07dff50f9a3bd..ad4195f31aa7c 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -196,7 +196,7 @@ "component": { "type": "git", "git": { - "commitHash": "bc0d2e35909b8456abe32f3b30a49bb0c125e8b7", + "commitHash": "9c69a24bc2e20c8a511a4e6b06fd49639ec5300a", "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git" }, "comments": "onnx_tensorrt" diff --git a/cmake/deps.txt b/cmake/deps.txt index 21f9ee1701c46..04a306e0ee657 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -36,8 +36,8 @@ microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.z mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.1.zip;2eb9198bb352757d5ff13977cbe0634898e0837c -# Use the latest commit of 10.6-GA-ORT-DDS -onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bc0d2e35909b8456abe32f3b30a49bb0c125e8b7.zip;f233ae871ad82c023da62e5dd620639f00bc2d15 +# Use the latest commit of 10.7-GA 
+onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/9c69a24bc2e20c8a511a4e6b06fd49639ec5300a.zip;ff1fe9af78eb129b4a4cdcb7450b7390b4436dd3 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa protoc_win64;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win64.zip;b4521f7ada5b260380f94c4bd7f1b7684c76969a protoc_win32;https://github.com/protocolbuffers/protobuf/releases/download/v21.12/protoc-21.12-win32.zip;3688010318192c46ce73213cdfb6b3e5656da874 diff --git a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py index 3ebc33c02592d..541dc4978dad1 100644 --- a/onnxruntime/python/tools/tensorrt/perf/build/build_image.py +++ b/onnxruntime/python/tools/tensorrt/perf/build/build_image.py @@ -15,10 +15,10 @@ from typing import List, Optional TRT_DOCKER_FILES = { - "8.6.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", - "8.6.cuda_12_3_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6", - "10.5.cuda_11_8_cudnn_8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", - "10.5.cuda_12_5_cudnn_9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", + "8.6_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_8_tensorrt8_6", + "8.6_cuda12.3_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_3_tensorrt8_6", + "10.7_cuda11.8_cudnn8": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10", + "10.7_cuda12.5_cudnn9": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10", "BIN": "tools/ci_build/github/linux/docker/Dockerfile.ubuntu_tensorrt_bin", } diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index 83cf26614a285..9286b5a54ac27 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -8,12 +8,12 @@ parameters: - name: TrtVersion displayName: TensorRT Version type: string - default: 10.5.cuda_12_5_cudnn_9 + default: 10.7_cuda12.5_cudnn9 values: - - 8.6.cuda_11_8_cudnn_8 - - 8.6.cuda_12_3_cudnn_9 - - 10.5.cuda_11_8_cudnn_8 - - 10.5.cuda_12_5_cudnn_9 + - 8.6_cuda11.8_cudnn8 + - 8.6_cuda12.3_cudnn9 + - 10.7_cuda11.8_cudnn8 + - 10.7_cuda12.5_cudnn9 - BIN - name: UseTensorrtOssParser @@ -198,4 +198,4 @@ jobs: parameters : condition : 'succeeded' - - template: templates/clean-agent-build-directory-step.yml + - template: templates/clean-agent-build-directory-step.yml \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml index 9296928ad97e0..cf434e4eadf0d 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml @@ -19,6 +19,6 @@ stages: python_wheel_suffix: '_gpu' timeout: 480 docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 - trt_version: '10.6.0.26-1.cuda11.8' + trt_version: '10.7.0.23-1.cuda11.8' cuda_version: '11.8' diff --git 
a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml index d35bed69ee409..3d4e5326ae7c6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/common-variables.yml +++ b/tools/ci_build/github/azure-pipelines/templates/common-variables.yml @@ -1,5 +1,5 @@ variables: - common_trt_version: '10.6.0.26' + common_trt_version: '10.7.0.23' # As for Debian installation, replace '-1.' by '-1+' when assigning trt version below linux_trt_version_cuda11: ${{ variables.common_trt_version }}-1.cuda11.8 linux_trt_version_cuda12: ${{ variables.common_trt_version }}-1.cuda12.6 diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 949479fb8b5e4..8409edb4d0429 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.201 + version: 1.0.202 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.201 + version: 1.0.202 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index ae54b3849a862..14b9c378bec14 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -13,10 +13,10 @@ parameters: - 12.2 - name: TrtVersion type: string - default: '10.6.0.26' + default: '10.7.0.23' values: - 8.6.1.6 - - 10.6.0.26 + - 10.7.0.23 steps: - ${{ if eq(parameters.DownloadCUDA, true) }}: @@ -42,7 +42,7 @@ steps: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.0" displayName: Set trtCudaVersion - - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.6.0.26')) }}: + - ${{ if and(eq(parameters.CudaVersion, '12.2'), eq(parameters.TrtVersion, '10.7.0.23')) }}: - powershell: | Write-Host "##vso[task.setvariable variable=trtCudaVersion;]12.6" displayName: Set trtCudaVersion diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml index dfaf237a711fe..45572416350c3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/set-winenv.yml @@ -15,10 +15,10 @@ parameters: default: '11.8' - name: win_trt_folder_cuda11 type: string - default: 'TensorRT-10.6.0.26.Windows10.x86_64.cuda-11.8' + default: 'TensorRT-10.7.0.23.Windows10.x86_64.cuda-11.8' - name: win_trt_folder_cuda12 type: string - default: 'TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6' + default: 'TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6' steps: - ${{ if eq(parameters.DownloadCUDA, 'true') }}: diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 index c2bae5fd7ee59..df5112dc38af4 100644 --- 
a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0 @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:12.5.1-cudnn-devel-ubi8 -ARG TRT_VERSION=10.6.0.26-1.cuda12.6 +ARG TRT_VERSION=10.7.0.23-1.cuda12.6 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch index 2ecc6d1918b1a..fef95b8574520 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -ARG TRT_VERSION=10.6.0.26-1.cuda11.8 +ARG TRT_VERSION=10.7.0.23-1.cuda11.8 FROM $BASEIMAGE AS base ARG TRT_VERSION ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 81aeada6a4a46..e91f14ff955b9 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 -ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG TRT_VERSION=10.7.0.23-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg index 4298dd53e4c66..0b08d4b3024b8 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 -ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG TRT_VERSION=10.7.0.23-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv index 1312475ceca3a..3a7e064686ae5 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv @@ -6,7 +6,7 @@ # Build base image with required system packages ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 -ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG TRT_VERSION=10.7.0.23-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base ARG TRT_VERSION diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 index 22d5e3b0248a8..01f08ff41e2cc 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 +++ 
b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda11_tensorrt10 @@ -31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install psutil setuptools>=68.2.2 # Install TensorRT -RUN TRT_VERSION="10.6.0.26-1+cuda11.8" &&\ +RUN TRT_VERSION="10.7.0.23-1+cuda11.8" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 index 819d9bab7be75..781f0647a084b 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_cuda12_tensorrt10 @@ -31,7 +31,7 @@ RUN pip install --upgrade pip RUN pip install setuptools>=68.2.2 psutil # Install TensorRT -RUN TRT_VERSION="10.6.0.26-1+cuda12.6" &&\ +RUN TRT_VERSION="10.7.0.23-1+cuda12.6" &&\ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ apt-get install -y \ diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile index a69b98f86ba1b..5f10607b11626 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/Dockerfile @@ -5,7 +5,7 @@ ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 FROM $BASEIMAGE -ARG TRT_VERSION=10.6.0.26-1.cuda11.8 +ARG TRT_VERSION=10.7.0.23-1.cuda11.8 #Install TensorRT only if TRT_VERSION is not empty RUN if [ -n "${TRT_VERSION}" ]; then \ diff --git a/tools/ci_build/github/windows/setup_env_gpu.bat b/tools/ci_build/github/windows/setup_env_gpu.bat index 34ddd75da16fc..4e2bd8f8386e2 100644 --- a/tools/ci_build/github/windows/setup_env_gpu.bat +++ b/tools/ci_build/github/windows/setup_env_gpu.bat @@ -6,10 +6,10 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( ) else ( set PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64;%PATH% ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6\lib;%PATH% @REM The default version is still cuda v12.2, because set cuda v11.8 after it -set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-11.8\lib +set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-11.8\lib if exist PATH=%AGENT_TEMPDIRECTORY%\v11.8\ ( set PATH=%PATH%;%AGENT_TEMPDIRECTORY%\v11.8\bin;%AGENT_TEMPDIRECTORY%\v11.8\extras\CUPTI\lib64 ) else ( diff --git a/tools/ci_build/github/windows/setup_env_trt.bat b/tools/ci_build/github/windows/setup_env_trt.bat index 03734293be5c4..6a602e46661e7 100644 --- a/tools/ci_build/github/windows/setup_env_trt.bat +++ b/tools/ci_build/github/windows/setup_env_trt.bat @@ -6,6 +6,6 @@ if exist PATH=%AGENT_TEMPDIRECTORY%\v12.2\ ( ) else ( set PATH=%PATH%;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\bin;C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.2\extras\CUPTI\lib64 ) -set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.6.0.26.Windows10.x86_64.cuda-12.6\lib;%PATH% +set PATH=%AGENT_TEMPDIRECTORY%\TensorRT-10.7.0.23.Windows10.x86_64.cuda-12.6\lib;%PATH% set GRADLE_OPTS=-Dorg.gradle.daemon=false set 
CUDA_MODULE_LOADING=LAZY