diff --git a/.github/workflows/publish-c-apidocs.yml b/.github/workflows/publish-c-apidocs.yml index 6c4dc43847d0b..72e69f6117ce9 100644 --- a/.github/workflows/publish-c-apidocs.yml +++ b/.github/workflows/publish-c-apidocs.yml @@ -9,7 +9,7 @@ on: - include/onnxruntime/core/session/** - orttraining/orttraining/training_api/include/** schedule: - - cron: '0 0 1 * *' + - cron: '0 0 1,15 * *' workflow_dispatch: concurrency: diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index 862a7a70e33a2..81ba703e8d5c1 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -8,7 +8,7 @@ on: paths: - csharp/** schedule: - - cron: '0 0 1 * *' + - cron: '0 0 1,15 * *' workflow_dispatch: concurrency: diff --git a/.github/workflows/publish-java-apidocs.yml b/.github/workflows/publish-java-apidocs.yml index 9e42dca708a17..bed96b1be7027 100644 --- a/.github/workflows/publish-java-apidocs.yml +++ b/.github/workflows/publish-java-apidocs.yml @@ -8,7 +8,7 @@ on: paths: - java/** schedule: - - cron: '0 0 1 * *' + - cron: '0 0 1,15 * *' workflow_dispatch: concurrency: diff --git a/.github/workflows/publish-js-apidocs.yml b/.github/workflows/publish-js-apidocs.yml index cec4a52d39c93..7af635f3eb50a 100644 --- a/.github/workflows/publish-js-apidocs.yml +++ b/.github/workflows/publish-js-apidocs.yml @@ -8,7 +8,7 @@ on: paths: - js/common/** schedule: - - cron: '0 0 1 * *' + - cron: '0 0 1,15 * *' workflow_dispatch: concurrency: diff --git a/.github/workflows/publish-objectivec-apidocs.yml b/.github/workflows/publish-objectivec-apidocs.yml index a8b81c8d5cf84..deef64f73f15a 100644 --- a/.github/workflows/publish-objectivec-apidocs.yml +++ b/.github/workflows/publish-objectivec-apidocs.yml @@ -8,7 +8,7 @@ on: paths: - objectivec/** schedule: - - cron: '0 0 1 * *' + - cron: '0 0 1,15 * *' workflow_dispatch: concurrency: diff --git a/.github/workflows/publish-python-apidocs.yml b/.github/workflows/publish-python-apidocs.yml index 8b2f72d80bacf..352fd3e948b4b 100644 --- a/.github/workflows/publish-python-apidocs.yml +++ b/.github/workflows/publish-python-apidocs.yml @@ -9,7 +9,7 @@ on: - onnxruntime/python/** - docs/python/** schedule: - - cron: '0 0 1 * *' + - cron: '0 0 1,15 * *' workflow_dispatch: concurrency: diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index 6a11f414361bd..20142e734dfac 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -2492,212 +2492,6 @@ DAMAGE. _____ -google/nsync - -Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. 
- - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright [yyyy] [name of copyright owner] - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - -_____ - google/re2 Copyright (c) 2009 The RE2 Authors. All rights reserved. 
diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index dc27a39ef1420..c8236c7c529a6 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -122,16 +122,6 @@ "comments": "google_benchmark" } }, - { - "component": { - "type": "git", - "git": { - "commitHash": "13de152c2a1cd73ff4df97bd2c406b6d15d34af3", - "repositoryUrl": "https://github.com/google/nsync.git" - }, - "comments": "google_nsync" - } - }, { "component": { "type": "git", diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index dab57e0c1f79f..9d1b39143016b 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1083,8 +1083,6 @@ function(onnxruntime_set_compile_flags target_name) if (CMAKE_CXX_COMPILER_ID STREQUAL "IBMClang") target_compile_options(${target_name} PRIVATE "-Wno-unused-function") endif() - target_compile_definitions(${target_name} PUBLIC -DNSYNC_ATOMIC_CPP11) - onnxruntime_add_include_to_target(${target_name} nsync::nsync_cpp) endif() foreach(ORT_FLAG ${ORT_PROVIDER_FLAGS}) target_compile_definitions(${target_name} PRIVATE ${ORT_FLAG}) @@ -1673,7 +1671,6 @@ if (WIN32) list(APPEND onnxruntime_EXTERNAL_LIBRARIES advapi32) endif() else() - list(APPEND onnxruntime_EXTERNAL_LIBRARIES nsync::nsync_cpp) list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${ICONV_LIB} ${CMAKE_DL_LIBS} Threads::Threads) endif() diff --git a/cmake/deps.txt b/cmake/deps.txt index 9219f16be0207..2aec0e35e1d7f 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -27,7 +27,6 @@ flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494 fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1 google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.8.5.zip;cd47d3d272faf353600c8cc2fdec2b52d6f69177 -google_nsync;https://github.com/google/nsync/archive/refs/tags/1.26.0.zip;5e7c00ef6bf5b787386fc040067903ec774e2752 googletest;https://github.com/google/googletest/archive/refs/tags/v1.15.0.zip;9d2d0af8d77ac726ea55d44a8fa727ec98311349 #xnnpack 2024.09.04 googlexnnpack;https://github.com/google/XNNPACK/archive/309b75c9e56e0a674bf78d59872ce131f814dfb6.zip;39FA5259EAEACE0547284B63D5CEDC4F05553F5A diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 85746027d4e8c..a69d2649ad832 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -86,27 +86,6 @@ if (onnxruntime_BUILD_BENCHMARKS) onnxruntime_fetchcontent_makeavailable(google_benchmark) endif() -if (NOT WIN32) - FetchContent_Declare( - google_nsync - URL ${DEP_URL_google_nsync} - URL_HASH SHA1=${DEP_SHA1_google_nsync} - PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/nsync/nsync_1.26.0.patch - FIND_PACKAGE_ARGS NAMES nsync unofficial-nsync - ) - #nsync tests failed on Mac Build - set(NSYNC_ENABLE_TESTS OFF CACHE BOOL "" FORCE) - onnxruntime_fetchcontent_makeavailable(google_nsync) - - if (google_nsync_SOURCE_DIR) - add_library(nsync::nsync_cpp ALIAS nsync_cpp) - target_include_directories(nsync_cpp PUBLIC ${google_nsync_SOURCE_DIR}/public) - endif() - if(TARGET unofficial::nsync::nsync_cpp AND NOT TARGET nsync::nsync_cpp) - message(STATUS "Aliasing unofficial::nsync::nsync_cpp to nsync::nsync_cpp") - 
add_library(nsync::nsync_cpp ALIAS unofficial::nsync::nsync_cpp) - endif() -endif() if(onnxruntime_USE_MIMALLOC) FetchContent_Declare( diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 0ba4694c329e3..20bb1fb772189 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -743,7 +743,7 @@ if (NOT onnxruntime_ORT_MINIMAL_BUILD) target_link_libraries(onnxruntime_mlas_q4dq PRIVATE cpuinfo) endif() if(NOT WIN32) - target_link_libraries(onnxruntime_mlas_q4dq PRIVATE nsync::nsync_cpp ${CMAKE_DL_LIBS}) + target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${CMAKE_DL_LIBS}) endif() if (CMAKE_SYSTEM_NAME STREQUAL "Android") target_link_libraries(onnxruntime_mlas_q4dq PRIVATE ${android_shared_libs}) diff --git a/cmake/onnxruntime_providers_cann.cmake b/cmake/onnxruntime_providers_cann.cmake index 0e26f7ee3a57b..2b82379ed66a9 100644 --- a/cmake/onnxruntime_providers_cann.cmake +++ b/cmake/onnxruntime_providers_cann.cmake @@ -21,7 +21,7 @@ onnxruntime_add_include_to_target(onnxruntime_providers_cann onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface) add_dependencies(onnxruntime_providers_cann onnxruntime_providers_shared ${onnxruntime_EXTERNAL_DEPENDENCIES}) - target_link_libraries(onnxruntime_providers_cann PRIVATE ascendcl acl_op_compiler fmk_onnx_parser nsync::nsync_cpp ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED}) + target_link_libraries(onnxruntime_providers_cann PRIVATE ascendcl acl_op_compiler fmk_onnx_parser ${ABSEIL_LIBS} ${ONNXRUNTIME_PROVIDERS_SHARED}) target_link_directories(onnxruntime_providers_cann PRIVATE ${onnxruntime_CANN_HOME}/lib64) target_include_directories(onnxruntime_providers_cann PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${onnxruntime_CANN_HOME} ${onnxruntime_CANN_HOME}/include) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 774b7a4f6bd77..39ad530146b33 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -275,10 +275,8 @@ if(APPLE) set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/cuda/exported_symbols.lst") - target_link_libraries(${target} PRIVATE nsync::nsync_cpp) elseif(UNIX) set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/cuda/version_script.lds -Xlinker --gc-sections") - target_link_libraries(${target} PRIVATE nsync::nsync_cpp) elseif(WIN32) set_property(TARGET ${target} APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/cuda/symbols.def") else() diff --git a/cmake/onnxruntime_providers_dnnl.cmake b/cmake/onnxruntime_providers_dnnl.cmake index f2965728524b7..9e5a7eed44fff 100644 --- a/cmake/onnxruntime_providers_dnnl.cmake +++ b/cmake/onnxruntime_providers_dnnl.cmake @@ -41,10 +41,8 @@ INSTALL_RPATH "@loader_path" BUILD_WITH_INSTALL_RPATH TRUE INSTALL_RPATH_USE_LINK_PATH FALSE) - target_link_libraries(onnxruntime_providers_dnnl PRIVATE nsync::nsync_cpp) elseif(UNIX) set_property(TARGET onnxruntime_providers_dnnl APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/dnnl/version_script.lds -Xlinker --gc-sections -Xlinker -rpath=\$ORIGIN") - target_link_libraries(onnxruntime_providers_dnnl PRIVATE nsync::nsync_cpp) elseif(WIN32) set_property(TARGET onnxruntime_providers_dnnl APPEND_STRING 
PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/dnnl/symbols.def") else() diff --git a/cmake/onnxruntime_providers_migraphx.cmake b/cmake/onnxruntime_providers_migraphx.cmake index d7d83b0ce8d64..685e77bc483bd 100644 --- a/cmake/onnxruntime_providers_migraphx.cmake +++ b/cmake/onnxruntime_providers_migraphx.cmake @@ -57,7 +57,7 @@ endif() if(UNIX) set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/migraphx/version_script.lds -Xlinker --gc-sections") - target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp stdc++fs) + target_link_libraries(onnxruntime_providers_migraphx PRIVATE stdc++fs) endif() if (onnxruntime_ENABLE_TRAINING_OPS) diff --git a/cmake/onnxruntime_providers_rocm.cmake b/cmake/onnxruntime_providers_rocm.cmake index 47cd151fb12ed..68f5319c0ae8d 100644 --- a/cmake/onnxruntime_providers_rocm.cmake +++ b/cmake/onnxruntime_providers_rocm.cmake @@ -217,7 +217,6 @@ if(UNIX) set_property(TARGET onnxruntime_providers_rocm APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/rocm/version_script.lds -Xlinker --gc-sections") - target_link_libraries(onnxruntime_providers_rocm PRIVATE nsync::nsync_cpp) else() message(FATAL_ERROR "onnxruntime_providers_rocm unknown platform, need to specify shared library exports for it") endif() diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index 468aaa44ec4ee..7b18222f334f9 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -206,11 +206,9 @@ if(APPLE) set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker -exported_symbols_list ${ONNXRUNTIME_ROOT}/core/providers/tensorrt/exported_symbols.lst") - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE nsync::nsync_cpp) elseif(UNIX) set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/tensorrt/version_script.lds -Xlinker --gc-sections") - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE nsync::nsync_cpp) elseif(WIN32) set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/tensorrt/symbols.def") else() diff --git a/cmake/onnxruntime_providers_vsinpu.cmake b/cmake/onnxruntime_providers_vsinpu.cmake index 4b987fd1e424b..e3b6c3c302c82 100644 --- a/cmake/onnxruntime_providers_vsinpu.cmake +++ b/cmake/onnxruntime_providers_vsinpu.cmake @@ -11,7 +11,7 @@ add_library(onnxruntime_providers_vsinpu ${onnxruntime_providers_vsinpu_srcs}) onnxruntime_add_include_to_target(onnxruntime_providers_vsinpu onnxruntime_common onnxruntime_framework onnx onnx_proto protobuf::libprotobuf-lite flatbuffers Boost::mp11 - safeint_interface nsync::nsync_cpp) + safeint_interface ) add_dependencies(onnxruntime_providers_vsinpu ${onnxruntime_EXTERNAL_DEPENDENCIES}) set_target_properties(onnxruntime_providers_vsinpu PROPERTIES FOLDER "ONNXRuntime" LINKER_LANGUAGE CXX) target_include_directories(onnxruntime_providers_vsinpu PRIVATE ${ONNXRUNTIME_ROOT} $ENV{TIM_VX_INSTALL}/include) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index cbae6990cd0b6..67e5a9c0aa08b 100644 --- 
a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -766,9 +766,7 @@ if(MSVC) target_compile_options(onnxruntime_test_utils PRIVATE "$<$:SHELL:--compiler-options /wd6326>" "$<$>:/wd6326>") else() - target_compile_definitions(onnxruntime_test_utils PUBLIC -DNSYNC_ATOMIC_CPP11) target_include_directories(onnxruntime_test_utils PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}) - onnxruntime_add_include_to_target(onnxruntime_test_utils nsync::nsync_cpp) endif() if (onnxruntime_USE_NCCL) target_include_directories(onnxruntime_test_utils PRIVATE ${NCCL_INCLUDE_DIRS}) @@ -802,9 +800,7 @@ if(NOT IOS) target_compile_options(onnx_test_runner_common PRIVATE "$<$:SHELL:--compiler-options /utf-8>" "$<$>:/utf-8>") else() - target_compile_definitions(onnx_test_runner_common PUBLIC -DNSYNC_ATOMIC_CPP11) target_include_directories(onnx_test_runner_common PRIVATE ${CMAKE_CURRENT_BINARY_DIR} ${ONNXRUNTIME_ROOT}) - onnxruntime_add_include_to_target(onnx_test_runner_common nsync::nsync_cpp) endif() if (MSVC AND NOT CMAKE_SIZEOF_VOID_P EQUAL 8) #TODO: fix the warnings, they are dangerous @@ -1207,7 +1203,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) # "Global initializer calls a non-constexpr function." BENCHMARK_CAPTURE macro needs this. target_compile_options(onnxruntime_mlas_benchmark PRIVATE /wd26426) else() - target_link_libraries(onnxruntime_mlas_benchmark PRIVATE nsync::nsync_cpp ${CMAKE_DL_LIBS}) + target_link_libraries(onnxruntime_mlas_benchmark PRIVATE ${CMAKE_DL_LIBS}) endif() if (CPUINFO_SUPPORTED AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") target_link_libraries(onnxruntime_mlas_benchmark PRIVATE cpuinfo) @@ -1280,7 +1276,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) ${onnxruntime_EXTERNAL_LIBRARIES} ${GETOPT_LIB_WIDE} ${SYS_PATH_LIB} ${CMAKE_DL_LIBS}) if(NOT WIN32) - list(APPEND onnxruntime_perf_test_libs nsync::nsync_cpp) if(onnxruntime_USE_SNPE) list(APPEND onnxruntime_perf_test_libs onnxruntime_providers_snpe) endif() @@ -1348,7 +1343,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) # test inference using shared lib set(onnxruntime_shared_lib_test_LIBS onnxruntime_mocked_allocator onnxruntime_test_utils onnxruntime_common onnx_proto) if(NOT WIN32) - list(APPEND onnxruntime_shared_lib_test_LIBS nsync::nsync_cpp) if(onnxruntime_USE_SNPE) list(APPEND onnxruntime_shared_lib_test_LIBS onnxruntime_providers_snpe) endif() @@ -1497,7 +1491,7 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) target_link_libraries(onnxruntime_mlas_test PRIVATE cpuinfo) endif() if(NOT WIN32) - target_link_libraries(onnxruntime_mlas_test PRIVATE nsync::nsync_cpp ${CMAKE_DL_LIBS}) + target_link_libraries(onnxruntime_mlas_test PRIVATE ${CMAKE_DL_LIBS}) endif() if (CMAKE_SYSTEM_NAME STREQUAL "Android") target_link_libraries(onnxruntime_mlas_test PRIVATE ${android_shared_libs}) @@ -1683,9 +1677,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") ${ONNXRUNTIME_CUSTOM_OP_REGISTRATION_TEST_SRC_DIR}/test_registercustomops.cc) set(onnxruntime_customopregistration_test_LIBS custom_op_library onnxruntime_common onnxruntime_test_utils) - if (NOT WIN32) - list(APPEND onnxruntime_customopregistration_test_LIBS nsync::nsync_cpp) - endif() + if (CPUINFO_SUPPORTED AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") list(APPEND onnxruntime_customopregistration_test_LIBS cpuinfo) endif() @@ -1693,7 +1685,7 @@ if (NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten") list(APPEND onnxruntime_customopregistration_test_LIBS ${TENSORRT_LIBRARY_INFER}) endif() if (${CMAKE_SYSTEM_NAME} 
MATCHES "AIX") - list(APPEND onnxruntime_customopregistration_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_lora onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 ${PROTOBUF_LIB} onnx onnx_proto nsync_cpp) + list(APPEND onnxruntime_customopregistration_test_LIBS onnxruntime_graph onnxruntime_session onnxruntime_providers onnxruntime_lora onnxruntime_framework onnxruntime_util onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 ${PROTOBUF_LIB} onnx onnx_proto) endif() AddTest(DYN TARGET onnxruntime_customopregistration_test @@ -1812,11 +1804,11 @@ if (onnxruntime_BUILD_SHARED_LIB AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" set(onnxruntime_logging_apis_test_LIBS onnxruntime_common onnxruntime_test_utils) if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") - list(APPEND onnxruntime_logging_apis_test_LIBS onnxruntime_session onnxruntime_util onnxruntime_lora onnxruntime_framework onnxruntime_common onnxruntime_graph onnxruntime_providers onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 ${PROTOBUF_LIB} onnx onnx_proto nsync_cpp) + list(APPEND onnxruntime_logging_apis_test_LIBS onnxruntime_session onnxruntime_util onnxruntime_lora onnxruntime_framework onnxruntime_common onnxruntime_graph onnxruntime_providers onnxruntime_mlas onnxruntime_optimizer onnxruntime_flatbuffers iconv re2 ${PROTOBUF_LIB} onnx onnx_proto) endif() if(NOT WIN32) - list(APPEND onnxruntime_logging_apis_test_LIBS nsync::nsync_cpp ${CMAKE_DL_LIBS}) + list(APPEND onnxruntime_logging_apis_test_LIBS ${CMAKE_DL_LIBS}) endif() AddTest(DYN diff --git a/cmake/onnxruntime_webassembly.cmake b/cmake/onnxruntime_webassembly.cmake index 1428307356199..2db0c7f566818 100644 --- a/cmake/onnxruntime_webassembly.cmake +++ b/cmake/onnxruntime_webassembly.cmake @@ -97,7 +97,7 @@ target_compile_options(onnx PRIVATE -Wno-unused-parameter -Wno-unused-variable) if (onnxruntime_BUILD_WEBASSEMBLY_STATIC_LIB) bundle_static_library(onnxruntime_webassembly - nsync::nsync_cpp + ${PROTOBUF_LIB} onnx onnx_proto @@ -175,7 +175,7 @@ else() endif() target_link_libraries(onnxruntime_webassembly PRIVATE - nsync::nsync_cpp + ${PROTOBUF_LIB} onnx onnx_proto diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index d8de7756bae22..ddf37cfded77d 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -258,7 +258,8 @@ Do not modify directly.* |||12|**T** = tensor(double), tensor(float), tensor(int32), tensor(int64)
**T1** = tensor(double), tensor(float), tensor(int32), tensor(int64)|
 |||[7, 11]|**T** = tensor(double), tensor(float)|
 |QLinearConv|*in* x:**T1**<br/> *in* x_scale:**tensor(float)**<br/> *in* x_zero_point:**T1**<br/> *in* w:**T2**<br/> *in* w_scale:**tensor(float)**<br/> *in* w_zero_point:**T2**<br/> *in* y_scale:**tensor(float)**<br/> *in* y_zero_point:**T3**<br/> *in* B:**T4**<br/> *out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)<br/> **T2** = tensor(int8), tensor(uint8)<br/> **T3** = tensor(int8), tensor(uint8)<br/> **T4** = tensor(int32)|
-|QLinearMatMul|*in* a:**T1**<br/> *in* a_scale:**TS**<br/> *in* a_zero_point:**T1**<br/> *in* b:**T2**<br/> *in* b_scale:**TS**<br/> *in* b_zero_point:**T2**<br/> *in* y_scale:**TS**<br/> *in* y_zero_point:**T3**<br/> *out* y:**T3**<br/><br/>or<br/><br/>*in* a:**T1**<br/> *in* a_scale:**tensor(float)**<br/> *in* a_zero_point:**T1**<br/> *in* b:**T2**<br/> *in* b_scale:**tensor(float)**<br/> *in* b_zero_point:**T2**<br/> *in* y_scale:**tensor(float)**<br/> *in* y_zero_point:**T3**<br/> *out* y:**T3**|10+|**T1** = tensor(int8), tensor(uint8)<br/> **T2** = tensor(int8), tensor(uint8)<br/> **T3** = tensor(int8), tensor(uint8)|
+|QLinearMatMul|*in* a:**T1**<br/> *in* a_scale:**TS**<br/> *in* a_zero_point:**T1**<br/> *in* b:**T2**<br/> *in* b_scale:**TS**<br/> *in* b_zero_point:**T2**<br/> *in* y_scale:**TS**<br/> *in* y_zero_point:**T3**<br/> *out* y:**T3**<br/><br/>or<br/><br/>*in* a:**T1**<br/> *in* a_scale:**tensor(float)**<br/> *in* a_zero_point:**T1**<br/> *in* b:**T2**<br/> *in* b_scale:**tensor(float)**<br/> *in* b_zero_point:**T2**<br/> *in* y_scale:**tensor(float)**<br/> *in* y_zero_point:**T3**<br/> *out* y:**T3**|21+|**T1** = tensor(int8), tensor(uint8)<br/> **T2** = tensor(int8), tensor(uint8)<br/> **T3** = tensor(int8), tensor(uint8)<br/> **TS** = tensor(float)|
+|||[10, 20]|**T1** = tensor(int8), tensor(uint8)<br/> **T2** = tensor(int8), tensor(uint8)<br/> **T3** = tensor(int8), tensor(uint8)|
 |QuantizeLinear|*in* x:**T1**<br/> *in* y_scale:**T1**<br/> *in* y_zero_point:**T2**<br/> *out* y:**T2**<br/><br/>or<br/><br/>*in* x:**T1**<br/> *in* y_scale:**tensor(float)**<br/> *in* y_zero_point:**T2**<br/> *out* y:**T2**|21+|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int4), tensor(int8), tensor(uint16), tensor(uint4), tensor(uint8)|
 |||[19, 20]|**T1** = tensor(float), tensor(float16)<br/> **T2** = tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int8), tensor(uint8)|
 |||[13, 18]|**T1** = tensor(float)<br/>
**T2** = tensor(int8), tensor(uint8)| diff --git a/include/onnxruntime/core/common/logging/logging.h b/include/onnxruntime/core/common/logging/logging.h index 9cdf42e222051..ab2c476f2975a 100644 --- a/include/onnxruntime/core/common/logging/logging.h +++ b/include/onnxruntime/core/common/logging/logging.h @@ -17,7 +17,6 @@ #include "core/common/logging/macros.h" #include "core/common/logging/severity.h" #include "core/common/logging/sink_types.h" -#include "core/platform/ort_mutex.h" #include "date/date.h" /* @@ -259,7 +258,7 @@ class LoggingManager final { std::unique_ptr sink_; #ifdef _WIN32 - mutable OrtMutex sink_mutex_; + mutable std::mutex sink_mutex_; #endif Severity default_min_severity_; const bool default_filter_user_data_; diff --git a/include/onnxruntime/core/graph/schema_registry.h b/include/onnxruntime/core/graph/schema_registry.h index b128e91afa9ae..ca51e3621b2c6 100644 --- a/include/onnxruntime/core/graph/schema_registry.h +++ b/include/onnxruntime/core/graph/schema_registry.h @@ -12,7 +12,6 @@ #include "core/graph/constants.h" #include "core/common/common.h" #include "core/common/status.h" -#include "core/platform/ort_mutex.h" namespace onnxruntime { using OpName_Domain_Version_Schema_Map = std::unordered_map< @@ -102,7 +101,7 @@ class OnnxRuntimeOpSchemaRegistry : public IOnnxRuntimeOpSchemaCollection { common::Status RegisterOpSchemaInternal(ONNX_NAMESPACE::OpSchema&& op_schema); - OrtMutex mutex_; + std::mutex mutex_; OpName_Domain_Version_Schema_Map map_; DomainToVersionRangeMap domain_version_range_map_; diff --git a/include/onnxruntime/core/platform/Barrier.h b/include/onnxruntime/core/platform/Barrier.h index 1148b052bd9af..bddc3ba8903f6 100644 --- a/include/onnxruntime/core/platform/Barrier.h +++ b/include/onnxruntime/core/platform/Barrier.h @@ -10,9 +10,9 @@ #include #include "core/common/spin_pause.h" -#include "core/platform/ort_mutex.h" #include +#include #include namespace onnxruntime { @@ -40,7 +40,7 @@ class Barrier { assert(((v + delta) & ~1) != 0); return; // either count has not dropped to 0, or waiter is not waiting } - std::unique_lock l(mu_); + std::unique_lock l(mu_); assert(!notified_); notified_ = true; cv_.notify_all(); @@ -55,7 +55,7 @@ class Barrier { unsigned int v = state_.fetch_or(1, std::memory_order_acq_rel); if ((v >> 1) == 0) return; - std::unique_lock l(mu_); + std::unique_lock l(mu_); while (!notified_) { cv_.wait(l); } @@ -63,8 +63,8 @@ class Barrier { } private: - OrtMutex mu_; - OrtCondVar cv_; + std::mutex mu_; + std::condition_variable cv_; std::atomic state_; // low bit is waiter flag bool notified_; const bool spin_; diff --git a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h index d4411a6d72356..27b14f008a8ba 100644 --- a/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h +++ b/include/onnxruntime/core/platform/EigenNonBlockingThreadPool.h @@ -50,7 +50,6 @@ #include "core/common/denormal.h" #include "core/common/inlined_containers_fwd.h" #include "core/common/spin_pause.h" -#include "core/platform/ort_mutex.h" #include "core/platform/ort_spin_lock.h" #include "core/platform/Barrier.h" @@ -460,7 +459,7 @@ class RunQueue { #ifdef USE_LOCK_FREE_QUEUE std::lock_guard mtx(spin_lock_); #else - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); #endif unsigned back = back_.load(std::memory_order_relaxed); Elem& e = array_[(back - 1) & kMask]; @@ -484,7 +483,7 @@ class RunQueue { #ifdef USE_LOCK_FREE_QUEUE std::lock_guard 
mtx(spin_lock_); #else - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); #endif unsigned back = back_.load(std::memory_order_relaxed); w_idx = (back - 1) & kMask; @@ -509,7 +508,7 @@ class RunQueue { #ifdef USE_LOCK_FREE_QUEUE std::lock_guard mtx(spin_lock_); #else - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); #endif unsigned back; Elem* e; @@ -555,7 +554,7 @@ class RunQueue { #ifdef USE_LOCK_FREE_QUEUE std::lock_guard mtx(spin_lock_); #else - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); #endif Elem& e = array_[w_idx]; ElemState s = e.state.load(std::memory_order_relaxed); @@ -631,7 +630,7 @@ class RunQueue { #ifdef USE_LOCK_FREE_QUEUE OrtSpinLock spin_lock_; #else - OrtMutex mutex_; + std::mutex mutex_; #endif // Low log(kSize) + 1 bits in front_ and back_ contain rolling index of @@ -1440,7 +1439,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter ThreadStatus seen = GetStatus(); if (seen == ThreadStatus::Blocking || seen == ThreadStatus::Blocked) { - std::unique_lock lk(mutex); + std::unique_lock lk(mutex); // Blocking state exists only transiently during the SetBlock() method // while holding the lock. We may observe it at the start of this // function, but after acquiring the lock then the target thread @@ -1470,7 +1469,7 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter void SetBlocked(std::function should_block, std::function post_block) { - std::unique_lock lk(mutex); + std::unique_lock lk(mutex); assert(GetStatus() == ThreadStatus::Spinning); status.store(ThreadStatus::Blocking, std::memory_order_relaxed); if (should_block()) { @@ -1485,8 +1484,8 @@ class ThreadPoolTempl : public onnxruntime::concurrency::ExtendedThreadPoolInter private: std::atomic status{ThreadStatus::Spinning}; - OrtMutex mutex; - OrtCondVar cv; + std::mutex mutex; + std::condition_variable cv; }; Environment& env_; diff --git a/include/onnxruntime/core/platform/ort_mutex.h b/include/onnxruntime/core/platform/ort_mutex.h deleted file mode 100644 index e24665f51423d..0000000000000 --- a/include/onnxruntime/core/platform/ort_mutex.h +++ /dev/null @@ -1,189 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#ifdef _WIN32 -#include -#include -namespace onnxruntime { -// Q: Why OrtMutex is better than std::mutex -// A: OrtMutex supports static initialization but std::mutex doesn't. Static initialization helps us prevent the "static -// initialization order problem". - -// Q: Why std::mutex can't make it? -// A: VC runtime has to support Windows XP at ABI level. But we don't have such requirement. - -// Q: Is OrtMutex faster than std::mutex? -// A: Sure - -class OrtMutex { - private: - SRWLOCK data_ = SRWLOCK_INIT; - - public: - constexpr OrtMutex() = default; - // SRW locks do not need to be explicitly destroyed. 
- ~OrtMutex() = default; - OrtMutex(const OrtMutex&) = delete; - OrtMutex& operator=(const OrtMutex&) = delete; - void lock() { AcquireSRWLockExclusive(native_handle()); } - bool try_lock() noexcept { return TryAcquireSRWLockExclusive(native_handle()) == TRUE; } - void unlock() noexcept { ReleaseSRWLockExclusive(native_handle()); } - using native_handle_type = SRWLOCK*; - - __forceinline native_handle_type native_handle() { return &data_; } -}; - -class OrtCondVar { - CONDITION_VARIABLE native_cv_object = CONDITION_VARIABLE_INIT; - - public: - constexpr OrtCondVar() noexcept = default; - ~OrtCondVar() = default; - - OrtCondVar(const OrtCondVar&) = delete; - OrtCondVar& operator=(const OrtCondVar&) = delete; - - void notify_one() noexcept { WakeConditionVariable(&native_cv_object); } - void notify_all() noexcept { WakeAllConditionVariable(&native_cv_object); } - - void wait(std::unique_lock& lk) { - if (SleepConditionVariableSRW(&native_cv_object, lk.mutex()->native_handle(), INFINITE, 0) != TRUE) { - std::terminate(); - } - } - template - void wait(std::unique_lock& __lk, _Predicate __pred); - - /** - * returns cv_status::timeout if the wait terminates when Rel_time has elapsed. Otherwise, the method returns - * cv_status::no_timeout. - * @param cond_mutex A unique_lock object. - * @param rel_time A chrono::duration object that specifies the amount of time before the thread wakes up. - * @return returns cv_status::timeout if the wait terminates when Rel_time has elapsed. Otherwise, the method returns - * cv_status::no_timeout - */ - template - std::cv_status wait_for(std::unique_lock& cond_mutex, const std::chrono::duration& rel_time); - using native_handle_type = CONDITION_VARIABLE*; - - native_handle_type native_handle() { return &native_cv_object; } - - private: - void timed_wait_impl(std::unique_lock& __lk, - std::chrono::time_point); -}; - -template -void OrtCondVar::wait(std::unique_lock& __lk, _Predicate __pred) { - while (!__pred()) wait(__lk); -} - -template -std::cv_status OrtCondVar::wait_for(std::unique_lock& cond_mutex, - const std::chrono::duration& rel_time) { - // TODO: is it possible to use nsync_from_time_point_ ? - using namespace std::chrono; - if (rel_time <= duration::zero()) - return std::cv_status::timeout; - using SystemTimePointFloat = time_point >; - using SystemTimePoint = time_point; - SystemTimePointFloat max_time = SystemTimePoint::max(); - steady_clock::time_point steady_now = steady_clock::now(); - system_clock::time_point system_now = system_clock::now(); - if (max_time - rel_time > system_now) { - nanoseconds remain = duration_cast(rel_time); - if (remain < rel_time) - ++remain; - timed_wait_impl(cond_mutex, system_now + remain); - } else - timed_wait_impl(cond_mutex, SystemTimePoint::max()); - return steady_clock::now() - steady_now < rel_time ? 
std::cv_status::no_timeout : std::cv_status::timeout; -} -} // namespace onnxruntime -#else -#include "nsync.h" -#include //for unique_lock -#include //for cv_status -namespace onnxruntime { - -class OrtMutex { - nsync::nsync_mu data_ = NSYNC_MU_INIT; - - public: - constexpr OrtMutex() = default; - ~OrtMutex() = default; - OrtMutex(const OrtMutex&) = delete; - OrtMutex& operator=(const OrtMutex&) = delete; - - void lock() { nsync::nsync_mu_lock(&data_); } - bool try_lock() noexcept { return nsync::nsync_mu_trylock(&data_) == 0; } - void unlock() noexcept { nsync::nsync_mu_unlock(&data_); } - - using native_handle_type = nsync::nsync_mu*; - native_handle_type native_handle() { return &data_; } -}; - -class OrtCondVar { - nsync::nsync_cv native_cv_object = NSYNC_CV_INIT; - - public: - constexpr OrtCondVar() noexcept = default; - - ~OrtCondVar() = default; - OrtCondVar(const OrtCondVar&) = delete; - OrtCondVar& operator=(const OrtCondVar&) = delete; - - void notify_one() noexcept { nsync::nsync_cv_signal(&native_cv_object); } - void notify_all() noexcept { nsync::nsync_cv_broadcast(&native_cv_object); } - - void wait(std::unique_lock& lk); - template - void wait(std::unique_lock& __lk, _Predicate __pred); - - /** - * returns cv_status::timeout if the wait terminates when Rel_time has elapsed. Otherwise, the method returns - * cv_status::no_timeout. - * @param cond_mutex A unique_lock object. - * @param rel_time A chrono::duration object that specifies the amount of time before the thread wakes up. - * @return returns cv_status::timeout if the wait terminates when Rel_time has elapsed. Otherwise, the method returns - * cv_status::no_timeout - */ - template - std::cv_status wait_for(std::unique_lock& cond_mutex, const std::chrono::duration& rel_time); - using native_handle_type = nsync::nsync_cv*; - native_handle_type native_handle() { return &native_cv_object; } - - private: - void timed_wait_impl(std::unique_lock& __lk, - std::chrono::time_point); -}; - -template -void OrtCondVar::wait(std::unique_lock& __lk, _Predicate __pred) { - while (!__pred()) wait(__lk); -} - -template -std::cv_status OrtCondVar::wait_for(std::unique_lock& cond_mutex, - const std::chrono::duration& rel_time) { - // TODO: is it possible to use nsync_from_time_point_ ? - using namespace std::chrono; - if (rel_time <= duration::zero()) - return std::cv_status::timeout; - using SystemTimePointFloat = time_point >; - using SystemTimePoint = time_point; - SystemTimePointFloat max_time = SystemTimePoint::max(); - steady_clock::time_point steady_now = steady_clock::now(); - system_clock::time_point system_now = system_clock::now(); - if (max_time - rel_time > system_now) { - nanoseconds remain = duration_cast(rel_time); - if (remain < rel_time) - ++remain; - timed_wait_impl(cond_mutex, system_now + remain); - } else - timed_wait_impl(cond_mutex, SystemTimePoint::max()); - return steady_clock::now() - steady_now < rel_time ? 
std::cv_status::no_timeout : std::cv_status::timeout; -} -}; // namespace onnxruntime -#endif diff --git a/onnxruntime/contrib_ops/cuda/fused_conv.cc b/onnxruntime/contrib_ops/cuda/fused_conv.cc index 279df73ee3d45..0554cc34933f1 100644 --- a/onnxruntime/contrib_ops/cuda/fused_conv.cc +++ b/onnxruntime/contrib_ops/cuda/fused_conv.cc @@ -348,7 +348,7 @@ class FusedConv : public onnxruntime::cuda::CudaKernel { } Status ComputeInternal(OpKernelContext* context) const override { - std::lock_guard lock(s_.mutex); + std::lock_guard lock(s_.mutex); auto cudnnHandle = this->GetCudnnHandle(context); ORT_RETURN_IF_ERROR(UpdateState(context, true)); if (s_.Y->Shape().Size() == 0) { diff --git a/onnxruntime/contrib_ops/rocm/fused_conv.cc b/onnxruntime/contrib_ops/rocm/fused_conv.cc index 63804f79a32fb..4f3be98d97f80 100644 --- a/onnxruntime/contrib_ops/rocm/fused_conv.cc +++ b/onnxruntime/contrib_ops/rocm/fused_conv.cc @@ -144,7 +144,7 @@ class FusedConv : public onnxruntime::rocm::Conv { } Status ComputeInternal(OpKernelContext* context) const override { - std::lock_guard lock(Base::s_.mutex); + std::lock_guard lock(Base::s_.mutex); ORT_RETURN_IF_ERROR(Base::UpdateState(context, true)); if (Base::s_.Y->Shape().Size() == 0) { @@ -342,7 +342,7 @@ class FusedConv : public onnxruntime::rocm::Conv { }; struct FusionPlanCache { - mutable OrtMutex mutex; + mutable std::mutex mutex; using HashKey = uint32_t; std::unordered_map cache_directory_; @@ -351,7 +351,7 @@ class FusedConv : public onnxruntime::rocm::Conv { FusionPlanCacheItem& FindOrCreateFusionPlanCache(HashKey key, std::function factory) { - std::lock_guard lock(mutex); + std::lock_guard lock(mutex); auto iter = cache_directory_.find(key); if (iter == cache_directory_.end()) { cache_directory_[key].fusion = std::make_unique(); diff --git a/onnxruntime/core/common/logging/logging.cc b/onnxruntime/core/common/logging/logging.cc index a086c90ea4b14..a79e7300cffce 100644 --- a/onnxruntime/core/common/logging/logging.cc +++ b/onnxruntime/core/common/logging/logging.cc @@ -64,13 +64,13 @@ LoggingManager* LoggingManager::GetDefaultInstance() { #pragma warning(disable : 26426) #endif -static OrtMutex& DefaultLoggerMutex() noexcept { - static OrtMutex mutex; +static std::mutex& DefaultLoggerMutex() noexcept { + static std::mutex mutex; return mutex; } Logger* LoggingManager::s_default_logger_ = nullptr; -OrtMutex sink_mutex_; +std::mutex sink_mutex_; #ifdef _MSC_VER #pragma warning(pop) @@ -107,7 +107,7 @@ LoggingManager::LoggingManager(std::unique_ptr sink, Severity default_min // lock mutex to create instance, and enable logging // this matches the mutex usage in Shutdown - std::lock_guard guard(DefaultLoggerMutex()); + std::lock_guard guard(DefaultLoggerMutex()); if (DefaultLoggerManagerInstance().load() != nullptr) { ORT_THROW("Only one instance of LoggingManager created with InstanceType::Default can exist at any point in time."); @@ -127,7 +127,7 @@ LoggingManager::LoggingManager(std::unique_ptr sink, Severity default_min LoggingManager::~LoggingManager() { if (owns_default_logger_) { // lock mutex to reset DefaultLoggerManagerInstance() and free default logger from this instance. 
- std::lock_guard guard(DefaultLoggerMutex()); + std::lock_guard guard(DefaultLoggerMutex()); #if ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L))) DefaultLoggerManagerInstance().store(nullptr, std::memory_order_release); #else @@ -283,7 +283,7 @@ Severity OverrideLevelWithEtw(Severity original_severity) { bool LoggingManager::AddSinkOfType(SinkType sink_type, std::function()> sinkFactory, logging::Severity severity) { - std::lock_guard guard(sink_mutex_); + std::lock_guard guard(sink_mutex_); if (sink_->GetType() != SinkType::CompositeSink) { // Current sink is not a composite, create a new composite sink and add the current sink to it auto new_composite = std::make_unique(); @@ -305,7 +305,7 @@ bool LoggingManager::AddSinkOfType(SinkType sink_type, std::function guard(sink_mutex_); + std::lock_guard guard(sink_mutex_); if (sink_->GetType() == SinkType::CompositeSink) { auto composite_sink = static_cast(sink_.get()); diff --git a/onnxruntime/core/common/profiler.cc b/onnxruntime/core/common/profiler.cc index 71bca6ef3b582..8562e5524af74 100644 --- a/onnxruntime/core/common/profiler.cc +++ b/onnxruntime/core/common/profiler.cc @@ -85,7 +85,7 @@ void Profiler::EndTimeAndRecordEvent(EventCategory category, custom_logger_->SendProfileEvent(event); } else { // TODO: sync_gpu if needed. - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); if (events_.size() < max_num_events_) { events_.emplace_back(std::move(event)); } else { @@ -115,7 +115,7 @@ std::string Profiler::EndProfiling() { LOGS(*session_logger_, INFO) << "Writing profiler data to file " << profile_stream_file_; } - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); profile_stream_ << "[\n"; for (const auto& ep_profiler : ep_profilers_) { diff --git a/onnxruntime/core/common/profiler.h b/onnxruntime/core/common/profiler.h index a0bca0007b245..0103d8abb151f 100644 --- a/onnxruntime/core/common/profiler.h +++ b/onnxruntime/core/common/profiler.h @@ -11,7 +11,7 @@ #include "core/common/profiler_common.h" #include "core/common/logging/logging.h" -#include "core/platform/ort_mutex.h" +#include namespace onnxruntime { @@ -130,7 +130,7 @@ class Profiler { static std::atomic global_max_num_events_; // Mutex controlling access to profiler data - OrtMutex mutex_; + std::mutex mutex_; bool enabled_{false}; #if defined(__wasm__) /* diff --git a/onnxruntime/core/common/threadpool.cc b/onnxruntime/core/common/threadpool.cc index 7b62de799b6fc..b192688373851 100644 --- a/onnxruntime/core/common/threadpool.cc +++ b/onnxruntime/core/common/threadpool.cc @@ -21,9 +21,10 @@ limitations under the License. 
#include "core/common/cpuid_info.h" #include "core/common/eigen_common_wrapper.h" #include "core/platform/EigenNonBlockingThreadPool.h" -#include "core/platform/ort_mutex.h" +#include #if !defined(ORT_MINIMAL_BUILD) #ifdef _WIN32 +#include #include "processthreadsapi.h" #include #include diff --git a/onnxruntime/core/framework/bfc_arena.cc b/onnxruntime/core/framework/bfc_arena.cc index 13f9656ae0595..6788b4af3b982 100644 --- a/onnxruntime/core/framework/bfc_arena.cc +++ b/onnxruntime/core/framework/bfc_arena.cc @@ -276,7 +276,7 @@ void* BFCArena::Reserve(size_t size) { if (size == 0) return nullptr; - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); LOGS_DEFAULT(INFO) << "Reserving memory in BFCArena for " << device_allocator_->Info().name << " size: " << size; @@ -293,7 +293,7 @@ void* BFCArena::Reserve(size_t size) { } size_t BFCArena::RequestedSize(const void* ptr) { - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); BFCArena::ChunkHandle h = region_manager_.get_handle(ptr); ORT_ENFORCE(h != kInvalidChunkHandle); BFCArena::Chunk* c = ChunkFromHandle(h); @@ -301,7 +301,7 @@ size_t BFCArena::RequestedSize(const void* ptr) { } size_t BFCArena::AllocatedSize(const void* ptr) { - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); BFCArena::ChunkHandle h = region_manager_.get_handle(ptr); ORT_ENFORCE(h != kInvalidChunkHandle); BFCArena::Chunk* c = ChunkFromHandle(h); @@ -325,7 +325,7 @@ void* BFCArena::AllocateRawInternal(size_t num_bytes, // The BFC allocator tries to find the best fit first. BinNum bin_num = BinNumForSize(rounded_bytes); - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); // search for a valid chunk auto* chunk = FindChunkPtr(bin_num, rounded_bytes, @@ -377,7 +377,7 @@ void* BFCArena::AllocateRawInternal(size_t num_bytes, } void BFCArena::GetStats(AllocatorStats* stats) { - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); *stats = stats_; } @@ -496,7 +496,7 @@ void BFCArena::Free(void* p) { if (p == nullptr) { return; } - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); auto it = reserved_chunks_.find(p); if (it != reserved_chunks_.end()) { device_allocator_->Free(it->first); @@ -509,7 +509,7 @@ void BFCArena::Free(void* p) { } Status BFCArena::Shrink() { - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); auto num_regions = region_manager_.regions().size(); std::vector region_ptrs; std::vector region_sizes; @@ -807,7 +807,7 @@ void BFCArena::DumpMemoryLog(size_t num_bytes) { } #ifdef ORT_ENABLE_STREAM void BFCArena::ResetChunkOnTargetStream(Stream* target_stream, bool coalesce_flag) { - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); for (const auto& region : region_manager_.regions()) { ChunkHandle region_begin_chunk = region_manager_.get_handle(region.ptr()); diff --git a/onnxruntime/core/framework/bfc_arena.h b/onnxruntime/core/framework/bfc_arena.h index 5e4cd9f62f11b..8081738f2a5dc 100644 --- a/onnxruntime/core/framework/bfc_arena.h +++ b/onnxruntime/core/framework/bfc_arena.h @@ -27,7 +27,7 @@ limitations under the License. 
#include "core/common/logging/severity.h" #include "core/common/safeint.h" -#include "core/platform/ort_mutex.h" +#include #include "core/framework/arena_extend_strategy.h" #include "core/framework/allocator.h" @@ -489,7 +489,7 @@ class BFCArena : public IAllocator { std::unique_ptr device_allocator_; - mutable OrtMutex lock_; + mutable std::mutex lock_; RegionManager region_manager_; std::vector chunks_; diff --git a/onnxruntime/core/framework/execution_providers.h b/onnxruntime/core/framework/execution_providers.h index 43fe92edc9dfe..29cf79ec385d8 100644 --- a/onnxruntime/core/framework/execution_providers.h +++ b/onnxruntime/core/framework/execution_providers.h @@ -12,6 +12,7 @@ #include "core/graph/graph_viewer.h" #include "core/common/logging/logging.h" #ifdef _WIN32 +#include #include #include #include "core/platform/tracing.h" diff --git a/onnxruntime/core/framework/kernel_registry_manager.h b/onnxruntime/core/framework/kernel_registry_manager.h index 201fda6d978b6..1da73208cb536 100644 --- a/onnxruntime/core/framework/kernel_registry_manager.h +++ b/onnxruntime/core/framework/kernel_registry_manager.h @@ -12,7 +12,7 @@ #include "core/common/status.h" #include "core/framework/kernel_type_str_resolver.h" #include "core/graph/graph_viewer.h" -#include "core/platform/ort_mutex.h" +#include namespace onnxruntime { struct KernelCreateInfo; diff --git a/onnxruntime/core/framework/kernel_type_str_resolver.h b/onnxruntime/core/framework/kernel_type_str_resolver.h index 587be491b360a..a642649eca341 100644 --- a/onnxruntime/core/framework/kernel_type_str_resolver.h +++ b/onnxruntime/core/framework/kernel_type_str_resolver.h @@ -18,7 +18,7 @@ #include "core/common/status.h" #include "core/graph/op_identifier.h" #include "core/graph/graph.h" -#include "core/platform/ort_mutex.h" +#include namespace onnxruntime { @@ -129,7 +129,7 @@ class OpSchemaKernelTypeStrResolver final : public IKernelTypeStrResolver { // used as a cache when resolving // since the cache may be modified with a const instance, ensure that access to the cache is thread-safe mutable KernelTypeStrResolver resolver_; - mutable OrtMutex resolver_mutex_; + mutable std::mutex resolver_mutex_; }; #endif // !defined(ORT_MINIMAL_BUILD) diff --git a/onnxruntime/core/framework/mem_pattern_planner.h b/onnxruntime/core/framework/mem_pattern_planner.h index f4db5d9f1c75f..e4353ec22db92 100644 --- a/onnxruntime/core/framework/mem_pattern_planner.h +++ b/onnxruntime/core/framework/mem_pattern_planner.h @@ -20,7 +20,7 @@ limitations under the License. 
#include "core/common/safeint.h" #include "core/framework/mem_pattern.h" #include "core/framework/allocation_planner.h" -#include "core/platform/ort_mutex.h" +#include namespace onnxruntime { // MemPatternPlanner is used to trace allocation/free steps @@ -68,7 +68,7 @@ class MemPatternPlanner { void TraceAllocation(int ml_value_idx, const AllocPlanPerValue::ProgramCounter& counter, size_t size) { ORT_ENFORCE(using_counters_); - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); if (size == 0) { allocs_.emplace_back(ml_value_idx, MemoryBlock(0, 0)); @@ -133,7 +133,7 @@ class MemPatternPlanner { void TraceAllocation(int ml_value_idx, size_t size) { ORT_ENFORCE(!using_counters_); - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); if (size == 0) { allocs_.emplace_back(ml_value_idx, MemoryBlock(0, 0)); @@ -190,7 +190,7 @@ class MemPatternPlanner { } void TraceFree(int ml_value_index) { - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); for (auto it = blocks_.begin(); it != blocks_.end(); it++) { if (allocs_[*it].index_ == ml_value_index) { @@ -201,7 +201,7 @@ class MemPatternPlanner { } MemoryPattern GenerateMemPattern() const { - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); #ifdef ENABLE_TRAINING if (using_counters_) { @@ -261,7 +261,7 @@ class MemPatternPlanner { std::list blocks_; SafeInt buffer_size_{0}; bool using_counters_; - mutable OrtMutex lock_; + mutable std::mutex lock_; }; } // namespace onnxruntime diff --git a/onnxruntime/core/framework/model_metadef_id_generator.cc b/onnxruntime/core/framework/model_metadef_id_generator.cc index 8b1d1f4f304c9..4a35052d159a0 100644 --- a/onnxruntime/core/framework/model_metadef_id_generator.cc +++ b/onnxruntime/core/framework/model_metadef_id_generator.cc @@ -2,7 +2,7 @@ // Licensed under the MIT License. #include #include "model_metadef_id_generator.h" -#include "core/platform/ort_mutex.h" +#include #include "core/graph/graph_viewer.h" #include "core/framework/murmurhash3.h" @@ -11,8 +11,8 @@ int ModelMetadefIdGenerator::GenerateId(const onnxruntime::GraphViewer& graph_vi HashValue& model_hash) const { // if the EP is shared across multiple sessions there's a very small potential for concurrency issues. // use a lock when generating an id to be paranoid - static OrtMutex mutex; - std::lock_guard lock(mutex); + static std::mutex mutex; + std::lock_guard lock(mutex); model_hash = 0; // find the top level graph diff --git a/onnxruntime/core/framework/prepacked_weights_container.h b/onnxruntime/core/framework/prepacked_weights_container.h index 7fe317b6c4317..37fc01c05f2ae 100644 --- a/onnxruntime/core/framework/prepacked_weights_container.h +++ b/onnxruntime/core/framework/prepacked_weights_container.h @@ -11,7 +11,7 @@ #include "core/framework/buffer_deleter.h" #include "core/framework/allocator.h" -#include "core/platform/ort_mutex.h" +#include #include "prepacked_weights.h" namespace onnxruntime { @@ -53,7 +53,7 @@ class PrepackedWeightsContainer final { // PrePack() methods and does the read/write into the pre-packed weights' container. // We only want to invoke PrePack() on a kernel that doesn't have a cached version // of its pre-packed weight. 
diff --git a/onnxruntime/core/framework/prepacked_weights_container.h b/onnxruntime/core/framework/prepacked_weights_container.h
index 7fe317b6c4317..37fc01c05f2ae 100644
--- a/onnxruntime/core/framework/prepacked_weights_container.h
+++ b/onnxruntime/core/framework/prepacked_weights_container.h
@@ -11,7 +11,7 @@
 
 #include "core/framework/buffer_deleter.h"
 #include "core/framework/allocator.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "prepacked_weights.h"
 
 namespace onnxruntime {
@@ -53,7 +53,7 @@ class PrepackedWeightsContainer final {
   // PrePack() methods and does the read/write into the pre-packed weights' container.
   // We only want to invoke PrePack() on a kernel that doesn't have a cached version
   // of its pre-packed weight.
-  OrtMutex mutex_;
+  std::mutex mutex_;
 
   // Define allocators ahead of the container containing tensors because the allocators
   // needs to destructed after the container containing the pre-packed cached tensors
diff --git a/onnxruntime/core/framework/random_generator.h b/onnxruntime/core/framework/random_generator.h
index 39f31b2f9af8a..b0aa3df09ca62 100644
--- a/onnxruntime/core/framework/random_generator.h
+++ b/onnxruntime/core/framework/random_generator.h
@@ -7,7 +7,7 @@
 #include <atomic>
 #include <random>
 
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 
 namespace onnxruntime {
 
@@ -57,7 +57,7 @@ class PhiloxGenerator {
    * Resets the seed and offset.
    */
  void SetSeed(uint64_t seed) {
-    std::lock_guard<OrtMutex> lock(mutex_);
+    std::lock_guard<std::mutex> lock(mutex_);
     seed_ = seed;
     offset_ = 0;
   }
 
@@ -66,7 +66,7 @@ class PhiloxGenerator {
    * Gets the seed and offset pair, incrementing the offset by the specified count.
    */
   std::pair<uint64_t, uint64_t> NextPhiloxSeeds(uint64_t count) {
-    std::lock_guard<OrtMutex> lock(mutex_);
+    std::lock_guard<std::mutex> lock(mutex_);
     auto seeds = std::make_pair(seed_, offset_);
     offset_ += count;
     return seeds;
@@ -79,7 +79,7 @@ class PhiloxGenerator {
   static PhiloxGenerator& Default();
 
  private:
-  OrtMutex mutex_;
+  std::mutex mutex_;
   uint64_t seed_;
   uint64_t offset_;
 };
diff --git a/onnxruntime/core/framework/session_state.cc b/onnxruntime/core/framework/session_state.cc
index 4df0370ac719e..0d0b22ff61e01 100644
--- a/onnxruntime/core/framework/session_state.cc
+++ b/onnxruntime/core/framework/session_state.cc
@@ -5,7 +5,7 @@
 
 #include <sstream>
 
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/common/logging/logging.h"
 #include "core/common/safeint.h"
 #include "core/flatbuffers/schema/ort.fbs.h"
@@ -518,7 +518,7 @@ Status SessionState::PrepackConstantInitializedTensors(InlinedHashMap<std::string, size_t>&
-    std::lock_guard<OrtMutex> l(prepacked_weights_container_->mutex_);
+    std::lock_guard<std::mutex> l(prepacked_weights_container_->mutex_);
     return prepacked_constant_weights(true);
   } else {
     return prepacked_constant_weights(false);
@@ -775,7 +775,7 @@ const MemoryPatternGroup* SessionState::GetMemoryPatternGroup(
     const InlinedHashMap<int, TensorShape>*& out_inferred_shapes) const {
   out_inferred_shapes = nullptr;
   int64_t key = CalculateMemoryPatternsKey(tensor_inputs);
-  std::lock_guard<OrtMutex> lock(mem_patterns_lock_);
+  std::lock_guard<std::mutex> lock(mem_patterns_lock_);
   auto it = mem_patterns_.find(key);
   if (it == mem_patterns_.end()) {
 #ifdef ENABLE_TRAINING
@@ -851,7 +851,7 @@ Status SessionState::UpdateMemoryPatternGroupCache(gsl::span<const OrtValue> ten
                                                    MemoryPatternGroup mem_patterns) const {
   int64_t key = CalculateMemoryPatternsKey(tensor_inputs);
 
-  std::lock_guard<OrtMutex> lock(mem_patterns_lock_);
+  std::lock_guard<std::mutex> lock(mem_patterns_lock_);
   // Do not update if present, as the pointer to the existing one is cached
   mem_patterns_.emplace(key, std::move(mem_patterns));
   return Status::OK();
@@ -1588,7 +1588,7 @@ static void BindToDeviceStream(const SequentialExecutionPlan& execution_plan,
 
 std::unique_ptr<DeviceStreamCollection> SessionState::AcquireDeviceStreamCollection() const {
   if (has_device_stream_enabled_ep_) {
-    std::lock_guard<OrtMutex> lock(device_stream_pool_mutex_);
+    std::lock_guard<std::mutex> lock(device_stream_pool_mutex_);
     if (!device_stream_pool_.empty()) {
       auto device_stream = std::move(device_stream_pool_.back());
       device_stream_pool_.pop_back();
@@ -1607,7 +1607,7 @@ std::unique_ptr<DeviceStreamCollection> SessionState::AcquireDeviceStreamCollect
 void SessionState::RecycleDeviceStreamCollection(std::unique_ptr<DeviceStreamCollection> device_stream_collection) const {
   // if no need to reuse the device stream, don't perform the recycle
   if (has_device_stream_enabled_ep_) {
-    std::lock_guard<OrtMutex> lock(device_stream_pool_mutex_);
+    std::lock_guard<std::mutex> lock(device_stream_pool_mutex_);
     device_stream_pool_.push_back(std::move(device_stream_collection));
   } else {
     device_stream_collection.reset(nullptr);
diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h
index 5b7f6dc5cb867..e1674ba4b690b 100644
--- a/onnxruntime/core/framework/session_state.h
+++ b/onnxruntime/core/framework/session_state.h
@@ -35,7 +35,7 @@
 #include "core/framework/ort_value_name_idx_map.h"
 #include "core/graph/graph_viewer.h"
 #include "core/graph/onnx_protobuf.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/platform/path_lib.h"
 #include "core/platform/threadpool.h"
 #if !defined(ORT_MINIMAL_BUILD) && defined(ORT_MEMORY_PROFILE)
@@ -494,7 +494,7 @@ class SessionState {
   bool enable_mem_pattern_;
 
   // lock for the mem_patterns_
-  mutable OrtMutex mem_patterns_lock_;
+  mutable std::mutex mem_patterns_lock_;
   // cache for the generated mem_patterns. key is calculated based on input shapes.
   // must be a node based container as a pointer is cached.
   mutable NodeHashMap<int64_t, MemoryPatternGroup> mem_patterns_;
@@ -568,7 +568,7 @@ class SessionState {
   std::unique_ptr<IStreamCommandHandleRegistry> stream_handles_registry_;
 
   // lock for the device stream pool
-  mutable OrtMutex device_stream_pool_mutex_;
+  mutable std::mutex device_stream_pool_mutex_;
   mutable std::vector<std::unique_ptr<DeviceStreamCollection>> device_stream_pool_;
   // flag to indicate whether current session using any EP that create device stream dynamically.
   bool has_device_stream_enabled_ep_ = false;
diff --git a/onnxruntime/core/framework/tuning_context.h b/onnxruntime/core/framework/tuning_context.h
index 304fffa4ab7ca..96657d482d3a8 100644
--- a/onnxruntime/core/framework/tuning_context.h
+++ b/onnxruntime/core/framework/tuning_context.h
@@ -7,7 +7,7 @@
 #include <unordered_map>
 
 #include "core/common/common.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/framework/allocator.h"
 #include "core/framework/tuning_results.h"
 
@@ -77,7 +77,7 @@ class TuningResultsManager {
   void Clear();
 
  private:
-  mutable OrtMutex lock_;
+  mutable std::mutex lock_;
   std::unordered_map<std::string, KernelMap> results_;
 };
diff --git a/onnxruntime/core/graph/schema_registry.cc b/onnxruntime/core/graph/schema_registry.cc
index a7d94f4571d96..496825f00d452 100644
--- a/onnxruntime/core/graph/schema_registry.cc
+++ b/onnxruntime/core/graph/schema_registry.cc
@@ -10,7 +10,7 @@ common::Status OnnxRuntimeOpSchemaRegistry::SetBaselineAndOpsetVersionForDomain(
     const std::string& domain,
     int baseline_opset_version,
     int opset_version) {
-  std::lock_guard<OrtMutex> lock(mutex_);
+  std::lock_guard<std::mutex> lock(mutex_);
 
   auto it = domain_version_range_map_.find(domain);
   if (domain_version_range_map_.end() != it) {
diff --git a/onnxruntime/core/optimizer/pre_shape_node_elimination.cc b/onnxruntime/core/optimizer/pre_shape_node_elimination.cc
index 23980c9c10e6b..8f50ef7c09c95 100644
--- a/onnxruntime/core/optimizer/pre_shape_node_elimination.cc
+++ b/onnxruntime/core/optimizer/pre_shape_node_elimination.cc
@@ -48,7 +48,7 @@ bool PreShapeNodeElimination::SatisfyCondition(const Graph& graph, const Node& n
 
   for (const Node* next_node : output_nodes) {
     // Check if the next node is not of type "Shape"
-    if (!graph_utils::IsSupportedOptypeVersionAndDomain(*next_node, "Shape", {13, 15, 19}, kOnnxDomain)) {
+    if (!next_node || !graph_utils::IsSupportedOptypeVersionAndDomain(*next_node, "Shape", {13, 15, 19}, kOnnxDomain)) {
      return false;
    }
  }
diff --git a/onnxruntime/core/platform/posix/ort_mutex.cc b/onnxruntime/core/platform/posix/ort_mutex.cc
deleted file mode 100644
index e124ce168085f..0000000000000
--- a/onnxruntime/core/platform/posix/ort_mutex.cc
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) Microsoft Corporation. All rights reserved.
-// Licensed under the MIT License.
-
-#include "core/common/common.h"
-#include "core/platform/ort_mutex.h"
-#include <assert.h>
-#include <stdexcept>
-#include <system_error>
-
-namespace onnxruntime {
-void OrtCondVar::timed_wait_impl(std::unique_lock<OrtMutex>& lk,
-                                 std::chrono::time_point<std::chrono::system_clock, std::chrono::nanoseconds> tp) {
-  using namespace std::chrono;
-#ifndef NDEBUG
-  if (!lk.owns_lock())
-    ORT_THROW("condition_variable::timed wait: mutex not locked");
-#endif
-  nanoseconds d = tp.time_since_epoch();
-  timespec abs_deadline;
-  seconds s = duration_cast<seconds>(d);
-  using ts_sec = decltype(abs_deadline.tv_sec);
-  constexpr ts_sec ts_sec_max = std::numeric_limits<ts_sec>::max();
-  if (s.count() < ts_sec_max) {
-    abs_deadline.tv_sec = static_cast<ts_sec>(s.count());
-    abs_deadline.tv_nsec = static_cast<decltype(abs_deadline.tv_nsec)>((d - s).count());
-  } else {
-    abs_deadline.tv_sec = ts_sec_max;
-    abs_deadline.tv_nsec = 999999999;
-  }
-  nsync::nsync_cv_wait_with_deadline(&native_cv_object, lk.mutex()->native_handle(), abs_deadline, nullptr);
-}
-
-void OrtCondVar::wait(std::unique_lock<OrtMutex>& lk) {
-#ifndef NDEBUG
-  if (!lk.owns_lock()) {
-    ORT_THROW("OrtCondVar wait failed: mutex not locked");
-  }
-#endif
-  nsync::nsync_cv_wait(&native_cv_object, lk.mutex()->native_handle());
-}
-
-}  // namespace onnxruntime
\ No newline at end of file
diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.cc b/onnxruntime/core/platform/windows/logging/etw_sink.cc
index 889bc6fcf86df..bf73a538ea42f 100644
--- a/onnxruntime/core/platform/windows/logging/etw_sink.cc
+++ b/onnxruntime/core/platform/windows/logging/etw_sink.cc
@@ -65,12 +65,12 @@ EtwRegistrationManager& EtwRegistrationManager::Instance() {
 }
 
 bool EtwRegistrationManager::IsEnabled() const {
-  std::lock_guard<OrtMutex> lock(provider_change_mutex_);
+  std::lock_guard<std::mutex> lock(provider_change_mutex_);
   return is_enabled_;
 }
 
 UCHAR EtwRegistrationManager::Level() const {
-  std::lock_guard<OrtMutex> lock(provider_change_mutex_);
+  std::lock_guard<std::mutex> lock(provider_change_mutex_);
   return level_;
 }
 
@@ -94,7 +94,7 @@ Severity EtwRegistrationManager::MapLevelToSeverity() {
 }
 
 ULONGLONG EtwRegistrationManager::Keyword() const {
-  std::lock_guard<OrtMutex> lock(provider_change_mutex_);
+  std::lock_guard<std::mutex> lock(provider_change_mutex_);
   return keyword_;
 }
 
@@ -103,12 +103,12 @@ HRESULT EtwRegistrationManager::Status() const {
 }
 
 void EtwRegistrationManager::RegisterInternalCallback(const EtwInternalCallback& callback) {
-  std::lock_guard<OrtMutex> lock(callbacks_mutex_);
+  std::lock_guard<std::mutex> lock(callbacks_mutex_);
   callbacks_.push_back(&callback);
 }
 
 void EtwRegistrationManager::UnregisterInternalCallback(const EtwInternalCallback& callback) {
-  std::lock_guard<OrtMutex> lock(callbacks_mutex_);
+  std::lock_guard<std::mutex> lock(callbacks_mutex_);
   auto new_end = std::remove_if(callbacks_.begin(), callbacks_.end(),
                                 [&callback](const EtwInternalCallback* ptr) {
                                   return ptr == &callback;
@@ -126,7 +126,7 @@ void NTAPI EtwRegistrationManager::ORT_TL_EtwEnableCallback(
     _In_opt_ PVOID CallbackContext) {
   auto& manager = EtwRegistrationManager::Instance();
   {
-    std::lock_guard<OrtMutex> lock(manager.provider_change_mutex_);
+    std::lock_guard<std::mutex> lock(manager.provider_change_mutex_);
     manager.is_enabled_ = (IsEnabled != 0);
     manager.level_ = Level;
     manager.keyword_ = MatchAnyKeyword;
@@ -135,11 +135,11 @@ void NTAPI EtwRegistrationManager::ORT_TL_EtwEnableCallback(
 }
 
 EtwRegistrationManager::~EtwRegistrationManager() {
-  std::lock_guard<OrtMutex> lock(callbacks_mutex_);
+  std::lock_guard<std::mutex> lock(callbacks_mutex_);
   callbacks_.clear();
   if (initialization_status_ == InitializationStatus::Initialized ||
       initialization_status_ == InitializationStatus::Initializing) {
-    std::lock_guard<OrtMutex> init_lock(init_mutex_);
+    std::lock_guard<std::mutex> init_lock(init_mutex_);
     assert(initialization_status_ != InitializationStatus::Initializing);
     if (initialization_status_ == InitializationStatus::Initialized) {
       ::TraceLoggingUnregister(etw_provider_handle);
@@ -153,7 +153,7 @@ EtwRegistrationManager::EtwRegistrationManager() {
 
 void EtwRegistrationManager::LazyInitialize() {
   if (initialization_status_ == InitializationStatus::NotInitialized) {
-    std::lock_guard<OrtMutex> lock(init_mutex_);
+    std::lock_guard<std::mutex> lock(init_mutex_);
     if (initialization_status_ == InitializationStatus::NotInitialized) {  // Double-check locking pattern
       initialization_status_ = InitializationStatus::Initializing;
       etw_status_ = ::TraceLoggingRegisterEx(etw_provider_handle, ORT_TL_EtwEnableCallback, nullptr);
@@ -174,7 +174,7 @@ void EtwRegistrationManager::InvokeCallbacks(LPCGUID SourceId, ULONG IsEnabled,
     return;
   }
 
-  std::lock_guard<OrtMutex> lock(callbacks_mutex_);
+  std::lock_guard<std::mutex> lock(callbacks_mutex_);
   for (const auto& callback : callbacks_) {
     (*callback)(SourceId, IsEnabled, Level, MatchAnyKeyword, MatchAllKeyword, FilterData, CallbackContext);
   }
diff --git a/onnxruntime/core/platform/windows/logging/etw_sink.h b/onnxruntime/core/platform/windows/logging/etw_sink.h
index d6c9ea27b2955..2a798a28f13de 100644
--- a/onnxruntime/core/platform/windows/logging/etw_sink.h
+++ b/onnxruntime/core/platform/windows/logging/etw_sink.h
@@ -24,7 +24,7 @@
 
 #include "core/common/logging/capture.h"
 #include "core/common/logging/isink.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 
 namespace onnxruntime {
 namespace logging {
@@ -98,9 +98,9 @@ class EtwRegistrationManager {
       _In_opt_ PVOID CallbackContext);
 
   std::vector<const EtwInternalCallback*> callbacks_;
-  OrtMutex callbacks_mutex_;
-  mutable OrtMutex provider_change_mutex_;
-  OrtMutex init_mutex_;
+  std::mutex callbacks_mutex_;
+  mutable std::mutex provider_change_mutex_;
+  std::mutex init_mutex_;
   InitializationStatus initialization_status_ = InitializationStatus::NotInitialized;
   bool is_enabled_;
   UCHAR level_;
diff --git a/onnxruntime/core/platform/windows/telemetry.cc b/onnxruntime/core/platform/windows/telemetry.cc
index 86067d377205b..47789af9d5a47 100644
--- a/onnxruntime/core/platform/windows/telemetry.cc
+++ b/onnxruntime/core/platform/windows/telemetry.cc
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
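LazyInitialize above keeps its double-checked locking shape; only the guard type changes. When the checked flag is not atomic, the first unlocked read is technically a data race, which is why the standard library offers std::call_once as the race-free equivalent. A hedged sketch of that alternative (what the standard provides, not what this PR does):

#include <mutex>

class Registrar {
 public:
  void EnsureRegistered() {
    // runs Register() exactly once, even under concurrent callers
    std::call_once(once_, [this] { Register(); });
  }

 private:
  void Register() { registered_ = true; /* e.g. provider registration */ }
  std::once_flag once_;
  bool registered_ = false;
};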
#include "core/platform/windows/telemetry.h" -#include "core/platform/ort_mutex.h" +#include #include "core/common/logging/logging.h" #include "onnxruntime_config.h" @@ -57,18 +57,18 @@ TRACELOGGING_DEFINE_PROVIDER(telemetry_provider_handle, "Microsoft.ML.ONNXRuntim #pragma warning(pop) #endif -OrtMutex WindowsTelemetry::mutex_; -OrtMutex WindowsTelemetry::provider_change_mutex_; +std::mutex WindowsTelemetry::mutex_; +std::mutex WindowsTelemetry::provider_change_mutex_; uint32_t WindowsTelemetry::global_register_count_ = 0; bool WindowsTelemetry::enabled_ = true; uint32_t WindowsTelemetry::projection_ = 0; UCHAR WindowsTelemetry::level_ = 0; UINT64 WindowsTelemetry::keyword_ = 0; std::vector WindowsTelemetry::callbacks_; -OrtMutex WindowsTelemetry::callbacks_mutex_; +std::mutex WindowsTelemetry::callbacks_mutex_; WindowsTelemetry::WindowsTelemetry() { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); if (global_register_count_ == 0) { // TraceLoggingRegister is fancy in that you can only register once GLOBALLY for the whole process HRESULT hr = TraceLoggingRegisterEx(telemetry_provider_handle, ORT_TL_EtwEnableCallback, nullptr); @@ -79,7 +79,7 @@ WindowsTelemetry::WindowsTelemetry() { } WindowsTelemetry::~WindowsTelemetry() { - std::lock_guard lock(mutex_); + std::lock_guard lock(mutex_); if (global_register_count_ > 0) { global_register_count_ -= 1; if (global_register_count_ == 0) { @@ -87,22 +87,22 @@ WindowsTelemetry::~WindowsTelemetry() { } } - std::lock_guard lock_callbacks(callbacks_mutex_); + std::lock_guard lock_callbacks(callbacks_mutex_); callbacks_.clear(); } bool WindowsTelemetry::IsEnabled() const { - std::lock_guard lock(provider_change_mutex_); + std::lock_guard lock(provider_change_mutex_); return enabled_; } UCHAR WindowsTelemetry::Level() const { - std::lock_guard lock(provider_change_mutex_); + std::lock_guard lock(provider_change_mutex_); return level_; } UINT64 WindowsTelemetry::Keyword() const { - std::lock_guard lock(provider_change_mutex_); + std::lock_guard lock(provider_change_mutex_); return keyword_; } @@ -111,12 +111,12 @@ UINT64 WindowsTelemetry::Keyword() const { // } void WindowsTelemetry::RegisterInternalCallback(const EtwInternalCallback& callback) { - std::lock_guard lock_callbacks(callbacks_mutex_); + std::lock_guard lock_callbacks(callbacks_mutex_); callbacks_.push_back(&callback); } void WindowsTelemetry::UnregisterInternalCallback(const EtwInternalCallback& callback) { - std::lock_guard lock_callbacks(callbacks_mutex_); + std::lock_guard lock_callbacks(callbacks_mutex_); auto new_end = std::remove_if(callbacks_.begin(), callbacks_.end(), [&callback](const EtwInternalCallback* ptr) { return ptr == &callback; @@ -132,7 +132,7 @@ void NTAPI WindowsTelemetry::ORT_TL_EtwEnableCallback( _In_ ULONGLONG MatchAllKeyword, _In_opt_ PEVENT_FILTER_DESCRIPTOR FilterData, _In_opt_ PVOID CallbackContext) { - std::lock_guard lock(provider_change_mutex_); + std::lock_guard lock(provider_change_mutex_); enabled_ = (IsEnabled != 0); level_ = Level; keyword_ = MatchAnyKeyword; @@ -143,7 +143,7 @@ void NTAPI WindowsTelemetry::ORT_TL_EtwEnableCallback( void WindowsTelemetry::InvokeCallbacks(LPCGUID SourceId, ULONG IsEnabled, UCHAR Level, ULONGLONG MatchAnyKeyword, ULONGLONG MatchAllKeyword, PEVENT_FILTER_DESCRIPTOR FilterData, PVOID CallbackContext) { - std::lock_guard lock_callbacks(callbacks_mutex_); + std::lock_guard lock_callbacks(callbacks_mutex_); for (const auto& callback : callbacks_) { (*callback)(SourceId, IsEnabled, Level, MatchAnyKeyword, 
MatchAllKeyword, FilterData, CallbackContext); } diff --git a/onnxruntime/core/platform/windows/telemetry.h b/onnxruntime/core/platform/windows/telemetry.h index ed80f13e633ac..b23a60a44b5f0 100644 --- a/onnxruntime/core/platform/windows/telemetry.h +++ b/onnxruntime/core/platform/windows/telemetry.h @@ -8,7 +8,7 @@ #include "core/platform/telemetry.h" #include #include -#include "core/platform/ort_mutex.h" +#include #include "core/platform/windows/TraceLoggingConfig.h" namespace onnxruntime { @@ -69,14 +69,14 @@ class WindowsTelemetry : public Telemetry { static void UnregisterInternalCallback(const EtwInternalCallback& callback); private: - static OrtMutex mutex_; + static std::mutex mutex_; static uint32_t global_register_count_; static bool enabled_; static uint32_t projection_; static std::vector callbacks_; - static OrtMutex callbacks_mutex_; - static OrtMutex provider_change_mutex_; + static std::mutex callbacks_mutex_; + static std::mutex provider_change_mutex_; static UCHAR level_; static ULONGLONG keyword_; diff --git a/onnxruntime/core/providers/cann/cann_allocator.h b/onnxruntime/core/providers/cann/cann_allocator.h index 15fa7b177904a..1022374b51d9f 100644 --- a/onnxruntime/core/providers/cann/cann_allocator.h +++ b/onnxruntime/core/providers/cann/cann_allocator.h @@ -6,7 +6,7 @@ #include "core/common/inlined_containers.h" #include "core/framework/allocator.h" -#include "core/platform/ort_mutex.h" +#include namespace onnxruntime { diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.cc b/onnxruntime/core/providers/cann/cann_execution_provider.cc index 9a242919665bb..a799ed743ef52 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.cc +++ b/onnxruntime/core/providers/cann/cann_execution_provider.cc @@ -28,7 +28,7 @@ using onnxruntime::common::Status; namespace onnxruntime { // Models can only be parsed and built serially in the same process -OrtMutex g_mutex; +std::mutex g_mutex; class Memcpy final : public OpKernel { public: @@ -1389,7 +1389,7 @@ Status CANNExecutionProvider::Compile(const std::vector& fuse if (modelIDs_.find(filename) != modelIDs_.end()) { modelID = modelIDs_[filename]; } else { - std::lock_guard lock(g_mutex); + std::lock_guard lock(g_mutex); if (cann::FileExist(filename_with_suffix)) { CANN_RETURN_IF_ERROR(aclmdlLoadFromFile(filename_with_suffix.c_str(), &modelID)); diff --git a/onnxruntime/core/providers/cann/cann_execution_provider.h b/onnxruntime/core/providers/cann/cann_execution_provider.h index d83bd88d6958f..7debfa72778fd 100644 --- a/onnxruntime/core/providers/cann/cann_execution_provider.h +++ b/onnxruntime/core/providers/cann/cann_execution_provider.h @@ -12,7 +12,7 @@ #include "core/providers/shared_library/provider_api.h" #include "core/framework/arena_extend_strategy.h" #include "core/framework/execution_provider.h" -#include "core/platform/ort_mutex.h" +#include #include "core/providers/cann/cann_execution_provider_info.h" #include "core/providers/cann/cann_inc.h" #include "core/providers/cann/cann_utils.h" diff --git a/onnxruntime/core/providers/cann/cann_kernel.h b/onnxruntime/core/providers/cann/cann_kernel.h index 90180144202a7..5effbb4f56043 100644 --- a/onnxruntime/core/providers/cann/cann_kernel.h +++ b/onnxruntime/core/providers/cann/cann_kernel.h @@ -4,7 +4,7 @@ #pragma once -#include "core/platform/ort_mutex.h" +#include #include "core/providers/cann/cann_inc.h" #include "core/providers/cann/cann_call.h" #include "core/providers/cann/cann_execution_provider.h" diff --git 
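WindowsTelemetry above keeps all of its state static, so each static std::mutex needs a declaration in the header and exactly one definition in a translation unit. A minimal sketch of that split (hypothetical class, not ORT code):

#include <mutex>

// counter.h -- declare the static mutex next to the state it protects
class Counter {
 public:
  static int Next();

 private:
  static std::mutex mutex_;  // declared here...
  static int value_;
};

// counter.cc -- one definition per static member
std::mutex Counter::mutex_;  // ...defined here
int Counter::value_ = 0;

int Counter::Next() {
  std::lock_guard<std::mutex> lock(mutex_);
  return ++value_;
}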
diff --git a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc
index b7d9211e0a9c2..f7afbb2f98bd8 100644
--- a/onnxruntime/core/providers/coreml/coreml_execution_provider.cc
+++ b/onnxruntime/core/providers/coreml/coreml_execution_provider.cc
@@ -218,7 +218,7 @@ common::Status CoreMLExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
-          std::unique_lock<OrtMutex> lock(model->GetMutex());
+          std::unique_lock<std::mutex> lock(model->GetMutex());
           std::unordered_map<std::string, coreml::OnnxTensorData> outputs;
           outputs.reserve(model_outputs.size());
diff --git a/onnxruntime/core/providers/coreml/model/model.h b/onnxruntime/core/providers/coreml/model/model.h
index 75b9aaf2185c9..7fdd6b25bc7db 100644
--- a/onnxruntime/core/providers/coreml/model/model.h
+++ b/onnxruntime/core/providers/coreml/model/model.h
@@ -11,7 +11,7 @@
 #include <vector>
 
 #include "core/common/logging/logging.h"
 #include "core/common/status.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 
 #if defined(__OBJC__)
 @class MLMultiArray;
@@ -73,7 +73,7 @@ class Model {
   }
 
   // Mutex for exclusive lock to this model object
-  OrtMutex& GetMutex() { return mutex_; }
+  std::mutex& GetMutex() { return mutex_; }
 
   // Input and output names in the ORT fused node's order.
   // Names may have been adjusted from the originals due to CoreML naming rules.
@@ -101,7 +101,7 @@ class Model {
   std::unordered_set<std::string> scalar_outputs_;
   std::unordered_set<std::string> int64_outputs_;
 
-  OrtMutex mutex_;
+  std::mutex mutex_;
 };
 
 }  // namespace coreml
diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
index f880a39188a06..d57c33ae965b1 100644
--- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
+++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc
@@ -374,8 +374,10 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOn
                                                       QuantizeLinear);
 class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, 12, int8_t,
                                                       QuantizeLinear);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, uint8_t, QLinearMatMul);
-class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, int8_t, QLinearMatMul);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, 20, uint8_t,
+                                                      QLinearMatMul);
+class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, 20, int8_t,
+                                                      QLinearMatMul);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, uint8_t, MatMulInteger);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, int8_t, MatMulInteger);
 class ONNX_OPERATOR_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, ConvInteger);
@@ -1103,6 +1105,8 @@ class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain,
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, int16_t, DequantizeLinear);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, Int4x2, DequantizeLinear);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, UInt4x2, DequantizeLinear);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, uint8_t, QLinearMatMul);
+class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, int8_t, QLinearMatMul);
 
 #if !defined(DISABLE_FLOAT8_TYPES)
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, DequantizeLinear);
 class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, Float8E4M3FNUZ, DequantizeLinear);
@@ -1686,10 +1690,10 @@ Status RegisterOnnxOperatorKernels(KernelRegistry& kernel_registry) {
                                                                             uint8_t, QuantizeLinear)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, 12, int8_t, QuantizeLinear)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, uint8_t, QLinearMatMul)>,
-      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, int8_t, QLinearMatMul)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, 20, uint8_t, QLinearMatMul)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 10, 20, int8_t, QLinearMatMul)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, int16_t, DequantizeLinear)>,
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, Int4x2, DequantizeLinear)>,
      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, UInt4x2, DequantizeLinear)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, uint8_t, QLinearMatMul)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, int8_t, QLinearMatMul)>,
 #if !defined(DISABLE_FLOAT8_TYPES)
       BuildKernelCreateInfo<ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCpuExecutionProvider, kOnnxDomain, 21, Float8E4M3FN, DequantizeLinear)>,
diff --git a/onnxruntime/core/providers/cpu/generator/random.cc b/onnxruntime/core/providers/cpu/generator/random.cc
index dfa27f1f44d5a..091b01b81b5b1 100644
--- a/onnxruntime/core/providers/cpu/generator/random.cc
+++ b/onnxruntime/core/providers/cpu/generator/random.cc
@@ -138,7 +138,7 @@ static TensorProto::DataType InferDataType(const Tensor& tensor);
 
 Status RandomNormal::Compute(OpKernelContext* ctx) const {
   Tensor& Y = *ctx->Output(0, shape_);
 
-  std::lock_guard<OrtMutex> l(generator_mutex_);
+  std::lock_guard<std::mutex> l(generator_mutex_);
   auto status = RandomNormalCompute(mean_, scale_, generator_, dtype_, Y);
 
   return status;
@@ -147,7 +147,7 @@ Status RandomNormal::Compute(OpKernelContext* ctx) const {
 Status RandomUniform::Compute(OpKernelContext* ctx) const {
   Tensor& Y = *ctx->Output(0, shape_);
 
-  std::lock_guard<OrtMutex> l(generator_mutex_);
+  std::lock_guard<std::mutex> l(generator_mutex_);
   auto status = RandomUniformCompute(low_, high_, generator_, dtype_, Y);
 
   return status;
@@ -169,7 +169,7 @@ Status RandomNormalLike::Compute(OpKernelContext* ctx) const {
                            "Could not infer data type from input tensor with data type ",
                            X.DataType());
 
-  std::lock_guard<OrtMutex> l(generator_mutex_);
+  std::lock_guard<std::mutex> l(generator_mutex_);
   status = RandomNormalCompute(mean_, scale_, generator_, dtype, *Y);
 
   return status;
@@ -190,7 +190,7 @@ Status RandomUniformLike::Compute(OpKernelContext* ctx) const {
     return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL,
                            "Could not infer data type from input tensor with data type ",
                            X.DataType());
-  std::lock_guard<OrtMutex> l(generator_mutex_);
+  std::lock_guard<std::mutex> l(generator_mutex_);
   status = RandomUniformCompute(low_, high_, generator_, dtype, *Y);
 
   return status;
@@ -310,7 +310,7 @@ Status Multinomial::Compute(OpKernelContext* ctx) const {
   Tensor* Y = ctx->Output(0, {batch_size, num_samples_});
 
   Status status = Status::OK();
-  std::lock_guard<OrtMutex> l(generator_mutex_);
+  std::lock_guard<std::mutex> l(generator_mutex_);
   switch (output_dtype_) {
     case TensorProto::INT32: {
       status = MultinomialCompute<int32_t>(ctx, X, batch_size, num_classes, num_samples_, generator_, *Y);
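The random kernels above keep a per-kernel engine behind a mutex so that a model containing random generators stays deterministic even when Compute() runs from several threads. A minimal sketch of the idea (hypothetical kernel, not ORT code):

#include <cstdint>
#include <mutex>
#include <random>

class RandomKernel {
 public:
  explicit RandomKernel(uint64_t seed)
      : generator_(static_cast<std::default_random_engine::result_type>(seed)) {}

  // const entry point that may be called concurrently; the lock serializes
  // draws so the sample sequence depends only on the seed, not thread timing.
  float Sample() const {
    std::lock_guard<std::mutex> lock(generator_mutex_);
    return std::uniform_real_distribution<float>{0.0f, 1.0f}(generator_);
  }

 private:
  mutable std::default_random_engine generator_;
  mutable std::mutex generator_mutex_;
};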
diff --git a/onnxruntime/core/providers/cpu/generator/random.h b/onnxruntime/core/providers/cpu/generator/random.h
index 8a0390fe7af8c..1cfb276052f85 100644
--- a/onnxruntime/core/providers/cpu/generator/random.h
+++ b/onnxruntime/core/providers/cpu/generator/random.h
@@ -9,7 +9,7 @@
 #include "core/common/common.h"
 #include "core/framework/op_kernel.h"
 #include "core/framework/random_seed.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 
 namespace onnxruntime {
 
@@ -58,7 +58,7 @@ class RandomNormal final : public OpKernel {
   // use generator_mutex_ to ensure Compute() can be called concurrently.
   // this is to ensure that a model with random generators is deterministic and still can be executed in parallel.
   mutable std::default_random_engine generator_;
-  mutable onnxruntime::OrtMutex generator_mutex_;
+  mutable std::mutex generator_mutex_;
   ONNX_NAMESPACE::TensorProto::DataType dtype_;
   TensorShape shape_;
 };
@@ -94,7 +94,7 @@ class RandomNormalLike final : public OpKernel {
 
   // see comments for generator_ and generator_mutex_ in RandomNormal class.
   mutable std::default_random_engine generator_;
-  mutable onnxruntime::OrtMutex generator_mutex_;
+  mutable std::mutex generator_mutex_;
   ONNX_NAMESPACE::TensorProto::DataType dtype_ = ONNX_NAMESPACE::TensorProto::DataType::TensorProto_DataType_UNDEFINED;  // optional and may be inferred
 };
@@ -132,7 +132,7 @@ class RandomUniform final : public OpKernel {
 
   // see comments for generator_ and generator_mutex_ in RandomNormal class.
   mutable std::default_random_engine generator_;
-  mutable onnxruntime::OrtMutex generator_mutex_;
+  mutable std::mutex generator_mutex_;
   ONNX_NAMESPACE::TensorProto::DataType dtype_;
   TensorShape shape_;
 };
@@ -167,7 +167,7 @@ class RandomUniformLike final : public OpKernel {
 
   // see comments for generator_ and generator_mutex_ in RandomNormal class.
   mutable std::default_random_engine generator_;
-  mutable onnxruntime::OrtMutex generator_mutex_;
+  mutable std::mutex generator_mutex_;
   ONNX_NAMESPACE::TensorProto::DataType dtype_ = ONNX_NAMESPACE::TensorProto::DataType::TensorProto_DataType_UNDEFINED;  // optional and may be inferred
 };
@@ -201,7 +201,7 @@ class Multinomial final : public OpKernel {
 
   // see comments for generator_ and generator_mutex_ in RandomNormal class.
   mutable std::default_random_engine generator_;
-  mutable onnxruntime::OrtMutex generator_mutex_;
+  mutable std::mutex generator_mutex_;
   ONNX_NAMESPACE::TensorProto::DataType output_dtype_;
 };
 }  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h b/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h
index df27f888bb0af..94f79518ae8da 100644
--- a/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h
+++ b/onnxruntime/core/providers/cpu/ml/tree_ensemble_common.h
@@ -4,7 +4,7 @@
 #pragma once
 
 #include "tree_ensemble_aggregator.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/platform/threadpool.h"
 #include "tree_ensemble_helper.h"
diff --git a/onnxruntime/core/providers/cpu/quantization/quantize_linear_matmul.cc b/onnxruntime/core/providers/cpu/quantization/quantize_linear_matmul.cc
index cb162ade44559..be448455194f6 100644
--- a/onnxruntime/core/providers/cpu/quantization/quantize_linear_matmul.cc
+++ b/onnxruntime/core/providers/cpu/quantization/quantize_linear_matmul.cc
@@ -14,10 +14,11 @@ namespace onnxruntime {
 
 // uint8_t kernel supports weight being either uint8_t or int8_t
-ONNX_OPERATOR_TYPED_KERNEL_EX(
+ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
     QLinearMatMul,
     kOnnxDomain,
     10,
+    20,
     uint8_t,
     kCpuExecutionProvider,
     KernelDefBuilder()
@@ -26,21 +27,45 @@ ONNX_OPERATOR_TYPED_KERNEL_EX(
         .TypeConstraint("T3", DataTypeImpl::GetTensorType<uint8_t>()),
     QLinearMatMul);
 
+ONNX_OPERATOR_TYPED_KERNEL_EX(
+    QLinearMatMul,
+    kOnnxDomain,
+    21,
+    uint8_t,
+    kCpuExecutionProvider,
+    KernelDefBuilder()
+        .TypeConstraint("TS", DataTypeImpl::GetTensorType<float>())
+        .TypeConstraint("T1", DataTypeImpl::GetTensorType<uint8_t>())
+        .TypeConstraint("T2", {DataTypeImpl::GetTensorType<uint8_t>(), DataTypeImpl::GetTensorType<int8_t>()})
+        .TypeConstraint("T3", DataTypeImpl::GetTensorType<uint8_t>()),
+    QLinearMatMul);
+
 // int8_t kernel only supports weight being int8_t
-#define REGISTER_QLINEARMATMUL_INT8_KERNEL()                                 \
-  ONNX_OPERATOR_TYPED_KERNEL_EX(                                             \
-      QLinearMatMul,                                                         \
-      kOnnxDomain,                                                           \
-      10,                                                                    \
-      int8_t,                                                                \
-      kCpuExecutionProvider,                                                 \
-      KernelDefBuilder()                                                     \
-          .TypeConstraint("T1", DataTypeImpl::GetTensorType<int8_t>())       \
-          .TypeConstraint("T2", DataTypeImpl::GetTensorType<int8_t>())       \
-          .TypeConstraint("T3", DataTypeImpl::GetTensorType<int8_t>()),      \
-      QLinearMatMul);
-
-REGISTER_QLINEARMATMUL_INT8_KERNEL();
+ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_EX(
+    QLinearMatMul,
+    kOnnxDomain,
+    10,
+    20,
+    int8_t,
+    kCpuExecutionProvider,
+    KernelDefBuilder()
+        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int8_t>())
+        .TypeConstraint("T2", DataTypeImpl::GetTensorType<int8_t>())
+        .TypeConstraint("T3", DataTypeImpl::GetTensorType<int8_t>()),
+    QLinearMatMul);
+
+ONNX_OPERATOR_TYPED_KERNEL_EX(
+    QLinearMatMul,
+    kOnnxDomain,
+    21,
+    int8_t,
+    kCpuExecutionProvider,
+    KernelDefBuilder()
+        .TypeConstraint("TS", DataTypeImpl::GetTensorType<float>())
+        .TypeConstraint("T1", DataTypeImpl::GetTensorType<int8_t>())
+        .TypeConstraint("T2", DataTypeImpl::GetTensorType<int8_t>())
+        .TypeConstraint("T3", DataTypeImpl::GetTensorType<int8_t>()),
+    QLinearMatMul);
 
 Status QLinearMatMul::Compute(OpKernelContext* ctx) const {
   const auto* a = ctx->Input<Tensor>(IN_A);
diff --git a/onnxruntime/core/providers/cpu/text/string_normalizer.cc b/onnxruntime/core/providers/cpu/text/string_normalizer.cc
index 32de3105d627d..9bc671f68f19a 100644
--- a/onnxruntime/core/providers/cpu/text/string_normalizer.cc
+++ b/onnxruntime/core/providers/cpu/text/string_normalizer.cc
@@ -8,6 +8,7 @@
 #include "onnxruntime_config.h"
 
 #ifdef _MSC_VER
+#include <mutex>
 #include <codecvt>
 #endif  // _MSC_VER
diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.cc b/onnxruntime/core/providers/cuda/cuda_allocator.cc
index 2189af8e0ee2d..8c96d8f57a0ba 100644
--- a/onnxruntime/core/providers/cuda/cuda_allocator.cc
+++ b/onnxruntime/core/providers/cuda/cuda_allocator.cc
@@ -69,7 +69,7 @@ void* CUDAExternalAllocator::Alloc(size_t size) {
 
 void CUDAExternalAllocator::Free(void* p) {
   free_(p);
-  std::lock_guard<OrtMutex> lock(lock_);
+  std::lock_guard<std::mutex> lock(lock_);
   auto it = reserved_.find(p);
   if (it != reserved_.end()) {
     reserved_.erase(it);
@@ -80,7 +80,7 @@ void CUDAExternalAllocator::Free(void* p) {
 void* CUDAExternalAllocator::Reserve(size_t size) {
   void* p = Alloc(size);
   if (!p) return nullptr;
-  std::lock_guard<OrtMutex> lock(lock_);
+  std::lock_guard<std::mutex> lock(lock_);
   ORT_ENFORCE(reserved_.find(p) == reserved_.end());
   reserved_.insert(p);
   return p;
diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.h b/onnxruntime/core/providers/cuda/cuda_allocator.h
index 86d0d8007bbd8..2d94e2b1cda89 100644
--- a/onnxruntime/core/providers/cuda/cuda_allocator.h
+++ b/onnxruntime/core/providers/cuda/cuda_allocator.h
@@ -5,7 +5,7 @@
 
 #include "core/common/inlined_containers.h"
 #include "core/framework/allocator.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 
 namespace onnxruntime {
 
@@ -42,7 +42,7 @@ class CUDAExternalAllocator : public CUDAAllocator {
   void* Reserve(size_t size) override;
 
  private:
-  mutable OrtMutex lock_;
+  mutable std::mutex lock_;
   ExternalAlloc alloc_;
   ExternalFree free_;
   ExternalEmptyCache empty_cache_;
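CUDAExternalAllocator (and its ROCm and MIGraphX twins later in this diff) guards a set of reserved pointers with a mutex, since Reserve and Free can race. A minimal sketch of that bookkeeping (hypothetical class, not ORT code):

#include <cstdlib>
#include <mutex>
#include <unordered_set>

class TrackingAllocator {
 public:
  void* Reserve(size_t size) {
    void* p = std::malloc(size);
    if (!p) return nullptr;
    std::lock_guard<std::mutex> lock(lock_);
    reserved_.insert(p);  // remember pointers handed out by Reserve
    return p;
  }

  void Free(void* p) {
    std::free(p);
    std::lock_guard<std::mutex> lock(lock_);
    reserved_.erase(p);   // erasing an untracked pointer is a harmless no-op
  }

 private:
  mutable std::mutex lock_;  // protects reserved_
  std::unordered_set<void*> reserved_;
};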
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
index 82b29c7b0562e..d3f01c1f7adc1 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc
@@ -324,7 +324,7 @@ DataLayout CUDAExecutionProvider::GetPreferredLayout() const {
 CUDAExecutionProvider::~CUDAExecutionProvider() {
   // clean up thread local context caches
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    std::lock_guard<std::mutex> lock(context_state_.mutex);
     for (const auto& cache_weak : context_state_.caches_to_update_on_destruction) {
       const auto cache = cache_weak.lock();
       if (!cache) continue;
@@ -369,7 +369,7 @@ CUDAExecutionProvider::PerThreadContext& CUDAExecutionProvider::GetPerThreadCont
   // get context and update cache
   std::shared_ptr<PerThreadContext> context;
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    std::lock_guard<std::mutex> lock(context_state_.mutex);
 
     // get or create a context
     if (context_state_.retired_context_pool.empty()) {
@@ -406,7 +406,7 @@ void CUDAExecutionProvider::ReleasePerThreadContext() const {
   ORT_ENFORCE(cached_context);
 
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    std::lock_guard<std::mutex> lock(context_state_.mutex);
     context_state_.active_contexts.erase(cached_context);
     context_state_.retired_context_pool.push_back(cached_context);
   }
diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.h b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
index c5736733beb1d..bd2be2eac2181 100644
--- a/onnxruntime/core/providers/cuda/cuda_execution_provider.h
+++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.h
@@ -9,7 +9,7 @@
 
 #include "core/framework/arena_extend_strategy.h"
 #include "core/framework/execution_provider.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/providers/cuda/cuda_execution_provider_info.h"
 #include "core/providers/cuda/cuda_graph.h"
 #include "core/providers/cuda/cuda_pch.h"
@@ -251,7 +251,7 @@ class CUDAExecutionProvider : public IExecutionProvider {
     std::set<std::weak_ptr<PerThreadContextMap>, std::owner_less<std::weak_ptr<PerThreadContextMap>>>
         caches_to_update_on_destruction;
     // synchronizes access to PerThreadContextState members
-    OrtMutex mutex;
+    std::mutex mutex;
   };
 
   // The execution provider maintains the PerThreadContexts in this structure.
diff --git a/onnxruntime/core/providers/cuda/cuda_graph.h b/onnxruntime/core/providers/cuda/cuda_graph.h
index dd03db94b631c..064b526e604bc 100644
--- a/onnxruntime/core/providers/cuda/cuda_graph.h
+++ b/onnxruntime/core/providers/cuda/cuda_graph.h
@@ -6,7 +6,7 @@
 #include <set>
 
 #include "core/common/common.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/providers/cuda/cuda_pch.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/cuda/cuda_kernel.h b/onnxruntime/core/providers/cuda/cuda_kernel.h
index 9d37a9775872f..054dd9f9da9f3 100644
--- a/onnxruntime/core/providers/cuda/cuda_kernel.h
+++ b/onnxruntime/core/providers/cuda/cuda_kernel.h
@@ -6,7 +6,7 @@
 #include "core/providers/cuda/cuda_common.h"
 #include "core/providers/cuda/cuda_execution_provider.h"
 #include "core/providers/cuda/cuda_fwd.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/providers/cuda/cuda_stream_handle.h"
 
 namespace onnxruntime {
diff --git a/onnxruntime/core/providers/cuda/nn/conv.cc b/onnxruntime/core/providers/cuda/nn/conv.cc
index cc76198dc3ae9..3129f519da2e5 100644
--- a/onnxruntime/core/providers/cuda/nn/conv.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv.cc
@@ -457,7 +457,7 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected
 
 template <typename T, bool NHWC>
 Status Conv<T, NHWC>::ComputeInternal(OpKernelContext* context) const {
-  std::lock_guard<OrtMutex> lock(s_.mutex);
+  std::lock_guard<std::mutex> lock(s_.mutex);
   ORT_RETURN_IF_ERROR(UpdateState(context));
   if (s_.Y->Shape().Size() == 0) {
     return Status::OK();
diff --git a/onnxruntime/core/providers/cuda/nn/conv.h b/onnxruntime/core/providers/cuda/nn/conv.h
index 484d66081018b..e4047a6af272e 100644
--- a/onnxruntime/core/providers/cuda/nn/conv.h
+++ b/onnxruntime/core/providers/cuda/nn/conv.h
@@ -13,7 +13,7 @@
 #include <cudnn_frontend.h>
 #endif
 
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/providers/cuda/cuda_kernel.h"
 #include "core/providers/cuda/cudnn_common.h"
 #include "core/providers/cpu/nn/conv_attributes.h"
@@ -190,7 +190,7 @@ struct CudnnConvState {
   TensorShapeVector slice_axes;
 
   // note that conv objects are shared between execution frames, and a lock is needed to avoid multi-thread racing
-  OrtMutex mutex;
+  std::mutex mutex;
   IAllocatorUniquePtr<void> memory_for_cudnn_conv_results;
 
   ~CudnnConvState() {
diff --git a/onnxruntime/core/providers/cuda/nn/conv_8.h b/onnxruntime/core/providers/cuda/nn/conv_8.h
index 10239d09041fe..bcee1bcb7e231 100644
--- a/onnxruntime/core/providers/cuda/nn/conv_8.h
+++ b/onnxruntime/core/providers/cuda/nn/conv_8.h
@@ -387,7 +387,7 @@ Status Conv<T, NHWC>::UpdateState(OpKernelContext* context, bool bias_expected)
 
 template <typename T, bool NHWC>
 Status Conv<T, NHWC>::ComputeInternal(OpKernelContext* context) const {
-  std::lock_guard<OrtMutex> lock(s_.mutex);
+  std::lock_guard<std::mutex> lock(s_.mutex);
   ORT_RETURN_IF_ERROR(UpdateState(context));
   if (s_.Y->Shape().Size() == 0) {
     return Status::OK();
diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
index d4876e1714861..2972ae999adc4 100644
--- a/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
+++ b/onnxruntime/core/providers/cuda/nn/conv_transpose.cc
@@ -450,7 +450,7 @@ Status ConvTranspose<T, NHWC>::UpdateState(OpKernelContext* context, bool dyna
 
 template <typename T, bool NHWC>
 Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dynamic_padding) const {
-  std::lock_guard<OrtMutex> lock(s_.mutex);
+  std::lock_guard<std::mutex> lock(s_.mutex);
   ORT_RETURN_IF_ERROR(UpdateState(context, dynamic_padding));
   if (s_.Y->Shape().Size() == 0) {
     return Status::OK();
diff --git a/onnxruntime/core/providers/cuda/nn/conv_transpose_8.h b/onnxruntime/core/providers/cuda/nn/conv_transpose_8.h
index b46d41b887e41..aa1fe26ac97db 100644
--- a/onnxruntime/core/providers/cuda/nn/conv_transpose_8.h
+++ b/onnxruntime/core/providers/cuda/nn/conv_transpose_8.h
@@ -87,7 +87,7 @@ Status ConvTranspose<T, NHWC>::DoConvTranspose(OpKernelContext* context, bool dy
   }
 
   {
-    std::lock_guard<OrtMutex> lock(s_.mutex);
+    std::lock_guard<std::mutex> lock(s_.mutex);
     // CUDNN_CONFIG_RETURN_IF_ERROR(cudnnSetStream(CudnnHandle(), Stream(context)));
     // TODO: add a global cache if need to handle cases for multiple frames running simultaneously with
     // different batch_size
diff --git a/onnxruntime/core/providers/cuda/nvtx_profile_context.h b/onnxruntime/core/providers/cuda/nvtx_profile_context.h
index e2e3be07bd474..eb28f86becd20 100644
--- a/onnxruntime/core/providers/cuda/nvtx_profile_context.h
+++ b/onnxruntime/core/providers/cuda/nvtx_profile_context.h
@@ -7,7 +7,7 @@
 #include <string>
 #include <thread>
 
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 
 #ifdef ENABLE_NVTX_PROFILE
 
@@ -25,14 +25,14 @@ class Context {
   // Return tag for the specified thread.
   // If the thread's tag doesn't exist, this function returns an empty string.
   std::string GetThreadTagOrDefault(const std::thread::id& thread_id) {
-    const std::lock_guard<OrtMutex> lock(mtx_);
+    const std::lock_guard<std::mutex> lock(mtx_);
     return thread_tag_[thread_id];
   }
 
   // Set tag for the specified thread.
   void SetThreadTag(
       const std::thread::id& thread_id, const std::string& tag) {
-    const std::lock_guard<OrtMutex> lock(mtx_);
+    const std::lock_guard<std::mutex> lock(mtx_);
     thread_tag_[thread_id] = tag;
   }
 
@@ -44,7 +44,7 @@ class Context {
   // map from thread's id to its human-readable tag.
   std::unordered_map<std::thread::id, std::string> thread_tag_;
 
-  OrtMutex mtx_;
+  std::mutex mtx_;
 };
 
 }  // namespace profile
diff --git a/onnxruntime/core/providers/cuda/tensor/nonzero_impl.cu b/onnxruntime/core/providers/cuda/tensor/nonzero_impl.cu
index 0dcc188d039a9..ce5a1ebf3faa5 100644
--- a/onnxruntime/core/providers/cuda/tensor/nonzero_impl.cu
+++ b/onnxruntime/core/providers/cuda/tensor/nonzero_impl.cu
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
 
 #include "nonzero_impl.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/providers/cuda/shared_inc/cuda_call.h"
 #include "core/providers/cuda/cu_inc/common.cuh"
 #include <cub/cub.cuh>
diff --git a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
index ffda84921a3ee..c96f9cc1ff400 100644
--- a/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
+++ b/onnxruntime/core/providers/dnnl/dnnl_execution_provider.cc
@@ -12,7 +12,7 @@
 #include <omp.h>
 #endif  // defined(DNNL_OPENMP)
 
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/providers/shared_library/provider_api.h"
 #include "core/providers/dnnl/dnnl_execution_provider.h"
 
@@ -356,7 +356,7 @@ Status DnnlExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fuse
 
       // lock each subgraph_primitive as multiple threads have shared memories
       {
-        std::unique_lock<OrtMutex> lock(subgraph_primitive->GetMutex());
+        std::unique_lock<std::mutex> lock(subgraph_primitive->GetMutex());
         subgraph_primitive->Compile(inputs);
         std::unordered_map<std::string, OnnxTensorData> outputs;
         outputs.reserve(subgraph_num_outputs);
diff --git a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h
index a7e49b54d4507..3bd12f1cf6f7e 100644
--- a/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h
+++ b/onnxruntime/core/providers/dnnl/subgraph/dnnl_subgraph_primitive.h
@@ -4,7 +4,7 @@
 #pragma once
 #include "dnnl_subgraph.h"
 #include "dnnl.hpp"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 
 namespace onnxruntime {
 namespace ort_dnnl {
@@ -69,7 +69,7 @@ class DnnlSubgraphPrimitive {
   // If the input being a scalar affects the operator this function can be used to determine if the
   // original input from ORT was a scalar.
   bool IsScalar(const DnnlTensor& tensor);
-  OrtMutex& GetMutex() { return mutex_; }
+  std::mutex& GetMutex() { return mutex_; }
 
   // GetMemory in OrtFormat if the memory is not in the OrtFormat this will reorder the memory.
   // All memory will be moved to the dnnl_engine even if it is already in OrtFormat.
@@ -125,7 +125,7 @@ class DnnlSubgraphPrimitive {
   dnnl::engine cpu_engine_;
   dnnl::engine gpu_engine_;
 
-  OrtMutex mutex_;
+  std::mutex mutex_;
 
   // for memory debug purpose
   std::vector<std::pair<std::string, dnnl::memory>> items_to_print_;
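CoreML, NNAPI, and DNNL all follow the same shape: the compiled model object owns a mutex, exposes it via GetMutex(), and callers hold a std::unique_lock for the duration of one execution. A minimal sketch (hypothetical classes, not ORT code):

#include <mutex>

class CompiledModel {
 public:
  // Expose the lock rather than locking internally, so a caller can hold it
  // across several steps (prepare, bind buffers, run).
  std::mutex& GetMutex() { return mutex_; }
  void Run() { /* not thread-safe on its own */ }

 private:
  std::mutex mutex_;
};

void Execute(CompiledModel& model) {
  std::unique_lock<std::mutex> lock(model.GetMutex());  // exclusive use of the model
  model.Run();
}  // unlocked when `lock` goes out of scope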
diff --git a/onnxruntime/core/providers/migraphx/migraphx_allocator.cc b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc
index c9db31e8744a7..3d9ae2bf7e6ff 100644
--- a/onnxruntime/core/providers/migraphx/migraphx_allocator.cc
+++ b/onnxruntime/core/providers/migraphx/migraphx_allocator.cc
@@ -51,7 +51,7 @@ void* MIGraphXExternalAllocator::Alloc(size_t size) {
 
 void MIGraphXExternalAllocator::Free(void* p) {
   free_(p);
-  std::lock_guard<OrtMutex> lock(lock_);
+  std::lock_guard<std::mutex> lock(lock_);
   auto it = reserved_.find(p);
   if (it != reserved_.end()) {
     reserved_.erase(it);
@@ -62,7 +62,7 @@ void MIGraphXExternalAllocator::Free(void* p) {
 void* MIGraphXExternalAllocator::Reserve(size_t size) {
   void* p = Alloc(size);
   if (!p) return nullptr;
-  std::lock_guard<OrtMutex> lock(lock_);
+  std::lock_guard<std::mutex> lock(lock_);
   ORT_ENFORCE(reserved_.find(p) == reserved_.end());
   reserved_.insert(p);
   return p;
diff --git a/onnxruntime/core/providers/migraphx/migraphx_allocator.h b/onnxruntime/core/providers/migraphx/migraphx_allocator.h
index 64da844e8c714..c8c935eba44ab 100644
--- a/onnxruntime/core/providers/migraphx/migraphx_allocator.h
+++ b/onnxruntime/core/providers/migraphx/migraphx_allocator.h
@@ -5,7 +5,7 @@
 
 #include <set>
 #include "core/framework/allocator.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 
 namespace onnxruntime {
 
@@ -42,7 +42,7 @@ class MIGraphXExternalAllocator : public MIGraphXAllocator {
   void* Reserve(size_t size) override;
 
 private:
-  mutable OrtMutex lock_;
+  mutable std::mutex lock_;
   ExternalAlloc alloc_;
   ExternalFree free_;
   ExternalEmptyCache empty_cache_;
diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
index 6fc729a537bc5..e41cd577b0b21 100644
--- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
+++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.cc
@@ -835,6 +835,7 @@ GetUnsupportedNodeIndices(const GraphViewer& graph_viewer,
                                                      "GlobalMaxPool",
                                                      "Greater",
                                                      "GreaterOrEqual",
+                                                     "GroupNormalization",
                                                      "GroupQueryAttention",
                                                      "HardSigmoid",
                                                      "HardSwish",
@@ -843,6 +844,7 @@ GetUnsupportedNodeIndices(const GraphViewer& graph_viewer,
                                                      "ImageScaler",
                                                      "InstanceNormalization",
                                                      "IsNan",
+                                                     "LayerNormalization",
                                                      "LeakyRelu",
                                                      "Less",
                                                      "LessOrEqual",
@@ -1425,7 +1427,7 @@ Status MIGraphXExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>&
 
       {
         // lock to avoid race condition
-        std::lock_guard<OrtMutex> lock(*(mgx_state->mgx_mu_ptr));
+        std::lock_guard<std::mutex> lock(*(mgx_state->mgx_mu_ptr));
 
         void* rocm_stream;
         Ort::ThrowOnError(api->KernelContext_GetGPUComputeStream(context, &rocm_stream));
diff --git a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h
index 21679d1f6f151..91b6a4741b55e 100644
--- a/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h
+++ b/onnxruntime/core/providers/migraphx/migraphx_execution_provider.h
@@ -5,7 +5,7 @@
 
 #include "core/framework/arena_extend_strategy.h"
 #include "core/framework/execution_provider.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/providers/migraphx/migraphx_execution_provider_info.h"
 #include "core/providers/migraphx/migraphx_inc.h"
 
@@ -40,7 +40,7 @@ struct MIGraphXFuncState {
   migraphx::onnx_options options;
   migraphx::target t{};
   std::unordered_map<std::string, std::size_t> input_name_indexes;
-  OrtMutex* mgx_mu_ptr = nullptr;
+  std::mutex* mgx_mu_ptr = nullptr;
   bool no_input_shape = false;
   bool fp16_enable = false;
   bool int8_enable = false;
@@ -101,7 +101,7 @@ class MIGraphXExecutionProvider : public IExecutionProvider {
   std::string load_compiled_path_;
   bool dump_model_ops_ = false;
   migraphx::target t_;
-  OrtMutex mgx_mu_;
+  std::mutex mgx_mu_;
   hipStream_t stream_ = nullptr;
   bool exhaustive_tune_ = false;
   mutable std::filesystem::path model_path_;
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/model.h b/onnxruntime/core/providers/nnapi/nnapi_builtin/model.h
index 3ff28d52e470f..643209fbe72b0 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/model.h
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/model.h
@@ -6,7 +6,7 @@
 #include <vector>
 
 #include "builders/shaper.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "nnapi_lib/NeuralNetworksWrapper.h"
 
 struct NnApi;
@@ -98,7 +98,7 @@ class Model {
   void SetDynamicOutputBufferSize(size_t size) { dynamic_output_buffer_size_ = size; }
 
   // Mutex for exclusive lock to this model object
-  OrtMutex& GetMutex() { return mutex_; }
+  std::mutex& GetMutex() { return mutex_; }
 
   // If the given output is a scalar output
   // Since NNAPI does not support tensor with empty shape (scalar), we use {1} tensor for scalar in NNAPI
@@ -130,7 +130,7 @@ class Model {
   // This is map is to lookup the nnapi output from the onnx output
   std::unordered_map<std::string, std::string> onnx_to_nnapi_output_map_;
 
-  OrtMutex mutex_;
+  std::mutex mutex_;
 
   void AddInput(const std::string& name, const android::nn::wrapper::OperandType& operand_type);
diff --git a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc
index 4d2888222ff0f..fca52396a190c 100644
--- a/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc
+++ b/onnxruntime/core/providers/nnapi/nnapi_builtin/nnapi_execution_provider.cc
@@ -380,7 +380,7 @@ common::Status NnapiExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
 
       std::unique_ptr<Execution> execution;
-      std::unique_lock<OrtMutex> lock(model->GetMutex());
+      std::unique_lock<std::mutex> lock(model->GetMutex());
       ORT_RETURN_IF_ERROR(model->PrepareForExecution(execution));
 
       ORT_RETURN_IF_ERROR(execution->SetInputBuffers(inputs));
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.cc b/onnxruntime/core/providers/qnn/builder/qnn_model.cc
index b09ff51b666c7..dc797fef2d42a 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model.cc
@@ -247,7 +247,7 @@ Status QnnModel::ExecuteGraph(const Ort::KernelContext& context, const logging::
   {
     // Acquire mutex before calling graphExecute and profiling APIs to support calling session.Run()
     // from multiple threads.
-    std::lock_guard<OrtMutex> lock(graph_exec_mutex_);
+    std::lock_guard<std::mutex> lock(graph_exec_mutex_);
     execute_status = qnn_interface.graphExecute(graph_info_->Graph(),
                                                 qnn_inputs.data(),
                                                 static_cast<uint32_t>(qnn_inputs.size()),
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_model.h b/onnxruntime/core/providers/qnn/builder/qnn_model.h
index d9682cc3b3222..2e0935391ca78 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_model.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_model.h
@@ -8,7 +8,7 @@
 #include "core/common/status.h"
 #include "core/framework/node_unit.h"
 #include "core/graph/graph_viewer.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/providers/qnn/builder/qnn_def.h"
 #include "core/providers/qnn/builder/qnn_model_wrapper.h"
 #include "core/providers/qnn/builder/qnn_backend_manager.h"
@@ -143,7 +143,7 @@ class QnnModel {
   QnnBackendType qnn_backend_type_ = QnnBackendType::CPU;
 
   // Mutex acquired during graph execution to support multi-threaded inference of a single session.
-  OrtMutex graph_exec_mutex_;
+  std::mutex graph_exec_mutex_;
 };
 
 }  // namespace qnn
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index 4cd5d403e95b8..becb9a728b1e3 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -36,8 +36,8 @@ constexpr const char* QNN = "QNN";
 static std::unique_ptr<std::vector<std::function<void()>>> s_run_on_unload_;
 
 void RunOnUnload(std::function<void()> function) {
-  static OrtMutex mutex;
-  std::lock_guard<OrtMutex> guard(mutex);
+  static std::mutex mutex;
+  std::lock_guard<std::mutex> guard(mutex);
   if (!s_run_on_unload_) {
     s_run_on_unload_ = std::make_unique<std::vector<std::function<void()>>>();
   }
@@ -444,7 +444,7 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
 
 QNNExecutionProvider::~QNNExecutionProvider() {
   // clean up thread local context caches
-  std::lock_guard<OrtMutex> lock(context_state_.mutex);
+  std::lock_guard<std::mutex> lock(context_state_.mutex);
   for (const auto& cache_weak : context_state_.caches_to_update_on_destruction) {
     const auto cache = cache_weak.lock();
     if (!cache) continue;
@@ -1050,7 +1050,7 @@ QNNExecutionProvider::PerThreadContext& QNNExecutionProvider::GetPerThreadContex
   // get context and update cache
   std::shared_ptr<PerThreadContext> context;
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    std::lock_guard<std::mutex> lock(context_state_.mutex);
 
     // get or create a context
     if (context_state_.retired_context_pool.empty()) {
@@ -1084,7 +1084,7 @@ void QNNExecutionProvider::ReleasePerThreadContext() const {
   ORT_ENFORCE(cached_context);
 
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    std::lock_guard<std::mutex> lock(context_state_.mutex);
     context_state_.active_contexts.erase(cached_context);
     context_state_.retired_context_pool.push_back(cached_context);
   }
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
index 246ab1d5a6608..30e2fd53e9613 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.h
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
@@ -31,7 +31,7 @@ class SharedContext {
   }
 
   bool HasSharedQnnModels() {
-    const std::lock_guard<OrtMutex> lock(mtx_);
+    const std::lock_guard<std::mutex> lock(mtx_);
     return !shared_qnn_models_.empty();
   }
 
@@ -42,7 +42,7 @@ class SharedContext {
   }
 
   std::unique_ptr<qnn::QnnModel> GetSharedQnnModel(const std::string& model_name) {
-    const std::lock_guard<OrtMutex> lock(mtx_);
+    const std::lock_guard<std::mutex> lock(mtx_);
     auto it = find_if(shared_qnn_models_.begin(), shared_qnn_models_.end(),
                       [&model_name](const std::unique_ptr<qnn::QnnModel>& qnn_model) { return qnn_model->Name() == model_name; });
qnn_model) { return qnn_model->Name() == model_name; }); if (it == shared_qnn_models_.end()) { @@ -55,7 +55,7 @@ class SharedContext { bool SetSharedQnnModel(std::vector>&& shared_qnn_models, std::string& duplicate_graph_names) { - const std::lock_guard lock(mtx_); + const std::lock_guard lock(mtx_); bool graph_exist = false; for (auto& shared_qnn_model : shared_qnn_models) { auto& model_name = shared_qnn_model->Name(); @@ -81,7 +81,7 @@ class SharedContext { std::vector> shared_qnn_models_; // Producer sessions can be in parallel // Consumer sessions have to be after producer sessions initialized - OrtMutex mtx_; + std::mutex mtx_; }; // Logical device representation. @@ -202,7 +202,7 @@ class QNNExecutionProvider : public IExecutionProvider { std::set, std::owner_less>> caches_to_update_on_destruction; // synchronizes access to PerThreadContextState members - OrtMutex mutex; + std::mutex mutex; }; // The execution provider maintains the PerThreadContexts in this structure. diff --git a/onnxruntime/core/providers/rocm/nn/conv.cc b/onnxruntime/core/providers/rocm/nn/conv.cc index d7f47d07a8fec..f99885634b6c7 100644 --- a/onnxruntime/core/providers/rocm/nn/conv.cc +++ b/onnxruntime/core/providers/rocm/nn/conv.cc @@ -324,7 +324,7 @@ Status Conv::UpdateState(OpKernelContext* context, bool bias_expected) template Status Conv::ComputeInternal(OpKernelContext* context) const { - std::lock_guard lock(s_.mutex); + std::lock_guard lock(s_.mutex); ORT_RETURN_IF_ERROR(UpdateState(context)); if (s_.Y->Shape().Size() == 0) { return Status::OK(); diff --git a/onnxruntime/core/providers/rocm/nn/conv.h b/onnxruntime/core/providers/rocm/nn/conv.h index bc9846203e57d..e6ebb5a380d3f 100644 --- a/onnxruntime/core/providers/rocm/nn/conv.h +++ b/onnxruntime/core/providers/rocm/nn/conv.h @@ -3,7 +3,7 @@ #pragma once -#include "core/platform/ort_mutex.h" +#include #include "core/providers/rocm/rocm_kernel.h" #include "core/providers/rocm/miopen_common.h" #include "core/providers/cpu/nn/conv_attributes.h" @@ -158,7 +158,7 @@ struct MiopenConvState { TensorShapeVector slice_axes; // note that conv objects are shared between execution frames, and a lock is needed to avoid multi-thread racing - OrtMutex mutex; + std::mutex mutex; IAllocatorUniquePtr memory_for_miopen_conv_results; ~MiopenConvState() { diff --git a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc index 7447113fdf847..a6848e90b406d 100644 --- a/onnxruntime/core/providers/rocm/nn/conv_transpose.cc +++ b/onnxruntime/core/providers/rocm/nn/conv_transpose.cc @@ -66,7 +66,7 @@ Status ConvTranspose::DoConvTranspose(OpKernelContext* context, bool dy } { - std::lock_guard lock(s_.mutex); + std::lock_guard lock(s_.mutex); // TODO: add a global cache if need to handle cases for multiple frames running simultaneously with different batch_size bool input_dims_changed = (s_.last_x_dims.AsShapeVector() != x_dims); bool w_dims_changed = (s_.last_w_dims.AsShapeVector() != w_dims); diff --git a/onnxruntime/core/providers/rocm/rocm_allocator.cc b/onnxruntime/core/providers/rocm/rocm_allocator.cc index 4a11b158c2cce..27861a567a7f4 100644 --- a/onnxruntime/core/providers/rocm/rocm_allocator.cc +++ b/onnxruntime/core/providers/rocm/rocm_allocator.cc @@ -69,7 +69,7 @@ void* ROCMExternalAllocator::Alloc(size_t size) { void ROCMExternalAllocator::Free(void* p) { free_(p); - std::lock_guard lock(lock_); + std::lock_guard lock(lock_); auto it = reserved_.find(p); if (it != reserved_.end()) { reserved_.erase(it); @@ 
@@ -80,7 +80,7 @@ void ROCMExternalAllocator::Free(void* p) {

 void* ROCMExternalAllocator::Reserve(size_t size) {
   void* p = Alloc(size);
   if (!p) return nullptr;
-  std::lock_guard<OrtMutex> lock(lock_);
+  std::lock_guard<std::mutex> lock(lock_);
   ORT_ENFORCE(reserved_.find(p) == reserved_.end());
   reserved_.insert(p);
   return p;
diff --git a/onnxruntime/core/providers/rocm/rocm_allocator.h b/onnxruntime/core/providers/rocm/rocm_allocator.h
index 04de09ab9c00b..ef13fc2e25cda 100644
--- a/onnxruntime/core/providers/rocm/rocm_allocator.h
+++ b/onnxruntime/core/providers/rocm/rocm_allocator.h
@@ -5,7 +5,7 @@

 #include "core/common/inlined_containers.h"
 #include "core/framework/allocator.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>

 namespace onnxruntime {
@@ -42,7 +42,7 @@ class ROCMExternalAllocator : public ROCMAllocator {
   void* Reserve(size_t size) override;

  private:
-  mutable OrtMutex lock_;
+  mutable std::mutex lock_;
   ExternalAlloc alloc_;
   ExternalFree free_;
   ExternalEmptyCache empty_cache_;
diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
index f36b5e01dbbd3..02a21c033e988 100644
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.cc
@@ -302,7 +302,7 @@ ROCMExecutionProvider::ROCMExecutionProvider(const ROCMExecutionProviderInfo& in
 ROCMExecutionProvider::~ROCMExecutionProvider() {
   // clean up thread local context caches
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    std::lock_guard<std::mutex> lock(context_state_.mutex);
     for (const auto& cache_weak : context_state_.caches_to_update_on_destruction) {
       const auto cache = cache_weak.lock();
       if (!cache) continue;
@@ -337,7 +337,7 @@ ROCMExecutionProvider::PerThreadContext& ROCMExecutionProvider::GetPerThreadCont
   // get context and update cache
   std::shared_ptr<PerThreadContext> context;
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    std::lock_guard<std::mutex> lock(context_state_.mutex);

     // get or create a context
     if (context_state_.retired_context_pool.empty()) {
@@ -370,7 +370,7 @@ void ROCMExecutionProvider::ReleasePerThreadContext() const {
   ORT_ENFORCE(cached_context);

   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    std::lock_guard<std::mutex> lock(context_state_.mutex);
     context_state_.active_contexts.erase(cached_context);
     context_state_.retired_context_pool.push_back(cached_context);
   }
diff --git a/onnxruntime/core/providers/rocm/rocm_execution_provider.h b/onnxruntime/core/providers/rocm/rocm_execution_provider.h
index 3caff88fe9b30..be467869248ea 100644
--- a/onnxruntime/core/providers/rocm/rocm_execution_provider.h
+++ b/onnxruntime/core/providers/rocm/rocm_execution_provider.h
@@ -8,7 +8,7 @@

 #include "core/framework/arena_extend_strategy.h"
 #include "core/framework/execution_provider.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/providers/rocm/rocm_execution_provider_info.h"
 #include "core/providers/rocm/rocm_graph.h"
 #include "core/providers/rocm/rocm_pch.h"
@@ -205,7 +205,7 @@ class ROCMExecutionProvider : public IExecutionProvider {
     std::set<std::weak_ptr<PerThreadContextMap>, std::owner_less<std::weak_ptr<PerThreadContextMap>>> caches_to_update_on_destruction;
     // synchronizes access to PerThreadContextState members
-    OrtMutex mutex;
+    std::mutex mutex;
   };

   // The execution provider maintains the PerThreadContexts in this structure.
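The ROCm allocator hunks above guard a bookkeeping set rather than the allocation itself. A compact sketch of the same shape, assuming hypothetical external callbacks (plain `malloc`/`free` stand in for them here):

```cpp
#include <cstdlib>
#include <mutex>
#include <unordered_set>

// Reserve() and Free() both touch the 'reserved_' set, so both take the same
// lock; the underlying allocation calls stay outside the critical section.
class ExternalAllocator {
 public:
  void* Reserve(size_t size) {
    void* p = std::malloc(size);  // stand-in for the external alloc_ callback
    if (!p) return nullptr;
    std::lock_guard<std::mutex> lock(lock_);
    reserved_.insert(p);
    return p;
  }

  void Free(void* p) {
    std::free(p);  // stand-in for the external free_ callback
    std::lock_guard<std::mutex> lock(lock_);
    reserved_.erase(p);
  }

 private:
  mutable std::mutex lock_;
  std::unordered_set<void*> reserved_;
};
```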
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
index 97d88786e4bcd..4da40823ba4e9 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc
@@ -452,9 +452,9 @@ TensorrtLogger& GetTensorrtLogger(bool verbose_log) {
   return trt_logger;
 }

-std::unique_lock<OrtMutex> TensorrtExecutionProvider::GetApiLock() const {
-  static OrtMutex singleton;
-  return std::unique_lock<OrtMutex>(singleton);
+std::unique_lock<std::mutex> TensorrtExecutionProvider::GetApiLock() const {
+  static std::mutex singleton;
+  return std::unique_lock<std::mutex>(singleton);
 }

 /*
@@ -1236,7 +1236,7 @@ void TensorrtExecutionProvider::ReleasePerThreadContext() const {
   ORT_ENFORCE(cached_context);

   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    std::lock_guard<std::mutex> lock(context_state_.mutex);
     context_state_.active_contexts.erase(cached_context);
     context_state_.retired_context_pool.push_back(cached_context);
   }
@@ -1258,7 +1258,7 @@ TensorrtExecutionProvider::PerThreadContext& TensorrtExecutionProvider::GetPerTh
   // get context and update cache
   std::shared_ptr<PerThreadContext> context;
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    std::lock_guard<std::mutex> lock(context_state_.mutex);

     // get or create a context
     if (context_state_.retired_context_pool.empty()) {
@@ -1768,7 +1768,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv
 TensorrtExecutionProvider::~TensorrtExecutionProvider() {
   // clean up thread local context caches
   {
-    std::lock_guard<OrtMutex> lock(context_state_.mutex);
+    std::lock_guard<std::mutex> lock(context_state_.mutex);
     for (const auto& cache_weak : context_state_.caches_to_update_on_destruction) {
       const auto cache = cache_weak.lock();
       if (!cache) continue;
@@ -3430,7 +3430,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView
     // The whole compute_function should be considered the critical section where multiple threads may update kernel function state, access one builder, create/serialize/save engine,
     // save profile and serialize/save timing cache. Therefore, those operations should be synchronized across different threads when ORT is using multithreading.
     // More details here, https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
-    std::lock_guard<OrtMutex> lock(*(trt_state->tensorrt_mu_ptr));
+    std::lock_guard<std::mutex> lock(*(trt_state->tensorrt_mu_ptr));
     const std::unordered_map<std::string, size_t>& input_indexes = (trt_state->input_info)[0];
     const std::unordered_map<std::string, size_t>& output_indexes = (trt_state->output_info)[0];
     const std::unordered_map<std::string, size_t>& output_types = (trt_state->output_info)[1];
@@ -4099,7 +4099,7 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con
     // The whole compute_function should be considered the critical section.
    // More details here, https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading
-    std::lock_guard<OrtMutex> lock(*(trt_state->tensorrt_mu_ptr));
+    std::lock_guard<std::mutex> lock(*(trt_state->tensorrt_mu_ptr));
     const std::unordered_map<std::string, size_t>& input_indexes = (trt_state->input_info)[0];
     const std::unordered_map<std::string, size_t>& output_indexes = (trt_state->output_info)[0];
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
index 97c9367b0bb61..c057d48de4070 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h
@@ -12,7 +12,7 @@ typedef void* cudnnStatus_t;
 #endif

 #include "core/providers/tensorrt/nv_includes.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/providers/cuda/cuda_graph.h"
 #include "tensorrt_execution_provider_info.h"
@@ -169,7 +169,7 @@ struct TensorrtFuncState {
   std::vector<std::unordered_map<std::string, size_t>> input_info;
   std::vector<std::unordered_map<std::string, size_t>> output_info;
   std::unordered_map<std::string, std::unordered_map<size_t, std::vector<std::vector<int64_t>>>> input_shape_ranges;
-  OrtMutex* tensorrt_mu_ptr = nullptr;
+  std::mutex* tensorrt_mu_ptr = nullptr;
   bool fp16_enable = false;
   bool int8_enable = false;
   bool int8_calibration_cache_available = false;
@@ -214,7 +214,7 @@ struct TensorrtShortFuncState {
   std::vector<std::unordered_map<std::string, size_t>> output_info;
   bool context_memory_sharing_enable = false;
   size_t* max_context_mem_size_ptr = nullptr;
-  OrtMutex* tensorrt_mu_ptr = nullptr;
+  std::mutex* tensorrt_mu_ptr = nullptr;
 };

 // Holds important information for building valid ORT graph.
@@ -312,7 +312,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   std::string tactic_sources_;
   std::string global_cache_path_, cache_path_, engine_decryption_lib_path_;
   std::unique_ptr<nvinfer1::IRuntime> runtime_ = nullptr;
-  OrtMutex tensorrt_mu_;
+  std::mutex tensorrt_mu_;
   int device_id_;
   std::string compute_capability_;
   bool context_memory_sharing_enable_ = false;
@@ -476,7 +476,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
     std::set<std::weak_ptr<PerThreadContextMap>, std::owner_less<std::weak_ptr<PerThreadContextMap>>> caches_to_update_on_destruction;
     // synchronizes access to PerThreadContextState members
-    OrtMutex mutex;
+    std::mutex mutex;
   };

   // The execution provider maintains the PerThreadContexts in this structure.
@@ -509,7 +509,7 @@ class TensorrtExecutionProvider : public IExecutionProvider {
   Every api call not in the thread-safe operations(https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#threading)
   should be protected by a lock when invoked by multiple threads concurrently.
   */
-  std::unique_lock<OrtMutex> GetApiLock() const;
+  std::unique_lock<std::mutex> GetApiLock() const;

   /**Check the graph is the subgraph of control flow op*/
   bool IsSubGraphOfControlFlowOp(const GraphViewer& graph) const;
diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc
index a4d2d6c9d65f3..e93d3565fe33d 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc
@@ -28,8 +28,8 @@ extern TensorrtLogger& GetTensorrtLogger(bool verbose);
 common::Status CreateTensorRTCustomOpDomainList(std::vector<OrtCustomOpDomain*>& domain_list, const std::string extra_plugin_lib_paths) {
   static std::unique_ptr<OrtCustomOpDomain> custom_op_domain = std::make_unique<OrtCustomOpDomain>();
   static std::vector<std::shared_ptr<TensorRTCustomOp>> created_custom_op_list;
-  static OrtMutex mutex;
-  std::lock_guard<OrtMutex> lock(mutex);
+  static std::mutex mutex;
+  std::lock_guard<std::mutex> lock(mutex);
   if (custom_op_domain->domain_ != "" && custom_op_domain->custom_ops_.size() > 0) {
     domain_list.push_back(custom_op_domain.get());
     return Status::OK();
diff --git a/onnxruntime/core/providers/tvm/tvm_execution_provider.h b/onnxruntime/core/providers/tvm/tvm_execution_provider.h
index e216570c2bebc..baa46c593fa07 100644
--- a/onnxruntime/core/providers/tvm/tvm_execution_provider.h
+++ b/onnxruntime/core/providers/tvm/tvm_execution_provider.h
@@ -11,7 +11,7 @@

 #include "core/common/logging/logging.h"
 #include "core/framework/execution_provider.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>

 #include "tvm_compiler.h"
 #include "tvm_runner.h"
diff --git a/onnxruntime/core/providers/tvm/tvm_so_execution_provider.h b/onnxruntime/core/providers/tvm/tvm_so_execution_provider.h
index e155aca6e01f0..d3840f46b5b55 100644
--- a/onnxruntime/core/providers/tvm/tvm_so_execution_provider.h
+++ b/onnxruntime/core/providers/tvm/tvm_so_execution_provider.h
@@ -11,7 +11,7 @@

 #include "core/common/logging/logging.h"
 #include "core/framework/execution_provider.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>

 #include "tvm_compiler.h"  // NOLINT(build/include_subdir)
 #include "tvm_runner.h"  // NOLINT(build/include_subdir)
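`GetApiLock()` above is the one place in this diff where the lock itself crosses a function boundary. A sketch of the idiom, relying only on the standard guarantee that `std::unique_lock` is movable and unlocks on destruction:

```cpp
#include <mutex>

// A function-local static mutex shared by every caller; the caller's scope
// becomes the critical section because the unique_lock is returned by value.
std::unique_lock<std::mutex> GetApiLock() {
  static std::mutex singleton;
  return std::unique_lock<std::mutex>(singleton);
}

void CallNonThreadSafeApi() {
  auto lock = GetApiLock();  // held until 'lock' goes out of scope
  // ... invoke builder/runtime APIs that are not thread-safe ...
}
```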
diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc
index 41885721e7b9a..772e778dd5ed4 100644
--- a/onnxruntime/core/providers/vitisai/imp/global_api.cc
+++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc
@@ -7,7 +7,9 @@
 #include
 #include
 #include
-
+#ifdef _WIN32
+#include
+#endif
 #include "./vai_assert.h"

 #include "core/common/exceptions.h"
@@ -52,6 +54,10 @@ struct OrtVitisAIEpAPI {
   int (*vitisai_ep_on_run_start)(
       const std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>& eps, const void* state,
       vaip_core::DllSafe<std::string> (*get_config_entry)(const void* state, const char* entry_name)) = nullptr;
+  int (*vitisai_ep_set_ep_dynamic_options)(
+      const std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>& eps,
+      const char* const* keys,
+      const char* const* values, size_t kv_len) = nullptr;
   void Ensure() {
     if (handle_) return;
@@ -77,6 +83,7 @@ struct OrtVitisAIEpAPI {
                                             (void**)&vaip_get_version);
     ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "create_ep_context_nodes", (void**)&create_ep_context_nodes));
     ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_on_run_start", (void**)&vitisai_ep_on_run_start));
+    ORT_THROW_IF_ERROR(env.GetSymbolFromLibrary(handle_, "vitisai_ep_set_ep_dynamic_options", (void**)&vitisai_ep_set_ep_dynamic_options));
   }

  private:
@@ -118,6 +125,15 @@ int vitisai_ep_on_run_start(
   return 100;
 }

+int vitisai_ep_set_ep_dynamic_options(
+    const std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>& eps, const char* const* keys,
+    const char* const* values, size_t kv_len) {
+  if (s_library_vitisaiep.vitisai_ep_set_ep_dynamic_options) {
+    return s_library_vitisaiep.vitisai_ep_set_ep_dynamic_options(eps, keys, values, kv_len);
+  }
+  return 100;
+}
+
 struct MyCustomOpKernel : OpKernel {
   MyCustomOpKernel(const OpKernelInfo& info, const OrtCustomOp& op) : OpKernel(info), op_(op) {
     op_kernel_ =
diff --git a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h
index 1a90f4c7fdebb..b0353bd6adae9 100644
--- a/onnxruntime/core/providers/vitisai/include/vaip/global_api.h
+++ b/onnxruntime/core/providers/vitisai/include/vaip/global_api.h
@@ -20,3 +20,7 @@ std::optional<std::vector<Node*>> create_ep_context_nodes(
 int vitisai_ep_on_run_start(
     const std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>& eps, const void* state,
     vaip_core::DllSafe<std::string> (*get_config_entry)(const void* state, const char* entry_name));
+int vitisai_ep_set_ep_dynamic_options(
+    const std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>& eps,
+    const char* const* keys,
+    const char* const* values, size_t kv_len);
diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
index 09b115b4a57fc..633847e6f163b 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
+++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.cc
@@ -110,9 +110,19 @@ common::Status VitisAIExecutionProvider::OnRunStart(const onnxruntime::RunOption
   };
   auto error_code = vitisai_ep_on_run_start(**execution_providers_, (const void*)&run_options, get_config_entry);
   if (error_code) {
-    return Status(onnxruntime::common::ONNXRUNTIME, onnxruntime::common::StatusCode::FAIL, std::to_string(error_code));
+    std::string error_msg = "vitisai_ep_on_run_start ret: " + std::to_string(error_code);
+    return Status(onnxruntime::common::ONNXRUNTIME, onnxruntime::common::StatusCode::FAIL, error_msg);
   }
   return Status::OK();
 }

+common::Status VitisAIExecutionProvider::SetEpDynamicOptions(gsl::span<const char* const> keys,
+                                                             gsl::span<const char* const> values) {
+  auto error_code = vitisai_ep_set_ep_dynamic_options(**execution_providers_, keys.data(), values.data(), std::min(keys.size(), values.size()));
+  if (error_code) {
+    std::string error_msg = "vitisai_ep_set_ep_dynamic_options ret: " + std::to_string(error_code);
+    return Status(onnxruntime::common::ONNXRUNTIME, onnxruntime::common::StatusCode::FAIL, error_msg);
+  }
+  return Status::OK();
+}
 }  // namespace onnxruntime
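The new `vitisai_ep_set_ep_dynamic_options` plumbing is an optional-symbol pattern: keep the function pointer null when the loaded library does not export the entry point, and return a sentinel (100, as above) instead of crashing. A simplified sketch with stand-in types:

```cpp
#include <cstddef>

// Hypothetical slimmed-down version of OrtVitisAIEpAPI: the pointer stays
// nullptr if dlsym/GetProcAddress could not resolve the symbol.
struct EpLibrary {
  int (*set_ep_dynamic_options)(const char* const* keys,
                                const char* const* values,
                                size_t kv_len) = nullptr;
};

int SetEpDynamicOptions(const EpLibrary& lib, const char* const* keys,
                        const char* const* values, size_t kv_len) {
  if (lib.set_ep_dynamic_options) {
    return lib.set_ep_dynamic_options(keys, values, kv_len);
  }
  return 100;  // old library: report "unsupported" via the same error path
}
```

Note that the caller above also clamps the pair count with `std::min(keys.size(), values.size())`, so mismatched key/value spans cannot over-read.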
diff --git a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h
index 05d2a976815b9..07085cd248d06 100644
--- a/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h
+++ b/onnxruntime/core/providers/vitisai/vitisai_execution_provider.h
@@ -39,6 +39,8 @@ class VitisAIExecutionProvider : public IExecutionProvider {
   // This method is called after both `GetComputeCapabilityOps()` and `Compile()`.
   // This timing is required to work with both compilation-based EPs and non-compilation-based EPs.
   const InlinedVector<const Node*> GetEpContextNodes() const override;
+  virtual common::Status SetEpDynamicOptions(gsl::span<const char* const> /*keys*/,
+                                             gsl::span<const char* const> /*values*/) override;

  private:
   using my_ep_t = vaip_core::DllSafe<std::vector<std::unique_ptr<vaip_core::ExecutionProvider>>>;
diff --git a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc
index 466fe1f82461c..669c702544de8 100644
--- a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc
+++ b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.cc
@@ -258,7 +258,7 @@ Status VSINPUExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fu
   compute_info.compute_func = [graph_ep, this](FunctionState /*state*/, const OrtApi* /* api */,
                                                OrtKernelContext* context) {
-    std::lock_guard<OrtMutex> lock(this->GetMutex());
+    std::lock_guard<std::mutex> lock(this->GetMutex());
     Status res = ComputeStateFunc(graph_ep.get(), context);
     return res;
   };
diff --git a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h
index 44318c332fdd0..c2605eb65faee 100644
--- a/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h
+++ b/onnxruntime/core/providers/vsinpu/vsinpu_execution_provider.h
@@ -43,11 +43,11 @@ class VSINPUExecutionProvider : public IExecutionProvider {
   std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
   Status Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
                  std::vector<NodeComputeInfo>& node_compute_funcs) override;
-  OrtMutex& GetMutex() { return mutex_; }
+  std::mutex& GetMutex() { return mutex_; }

  private:
   int device_id_;
-  OrtMutex mutex_;
+  std::mutex mutex_;
 };
 }  // namespace onnxruntime
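The VSINPU change shows the other common shape in this PR: the provider owns the mutex, exposes it through `GetMutex()`, and the compute callback created at `Compile()` time locks it on every inference. A minimal sketch (names illustrative, not ORT's API):

```cpp
#include <mutex>

class Provider {
 public:
  std::mutex& GetMutex() { return mutex_; }

 private:
  std::mutex mutex_;
};

// Stand-in for the compute_func lambda: one inference at a time per provider.
int Compute(Provider& ep) {
  std::lock_guard<std::mutex> lock(ep.GetMutex());
  return 0;  // run the compiled graph while the lock is held
}
```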
diff --git a/onnxruntime/core/providers/webnn/builders/model.h b/onnxruntime/core/providers/webnn/builders/model.h
index c554dcb6f6877..b8ab6677636db 100644
--- a/onnxruntime/core/providers/webnn/builders/model.h
+++ b/onnxruntime/core/providers/webnn/builders/model.h
@@ -6,7 +6,7 @@

 #include "core/common/inlined_containers.h"
 #include "core/common/status.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include
 #include
@@ -35,7 +35,7 @@ class Model {
                 const InlinedHashMap& outputs);

   // Mutex for exclusive lock to this model object.
-  OrtMutex& GetMutex() { return mutex_; }
+  std::mutex& GetMutex() { return mutex_; }

   // Input and output names in the onnx model's order.
   const std::vector<std::string>& GetInputs() const { return inputs_; }
@@ -77,7 +77,7 @@
   InlinedHashMap input_map_;
   InlinedHashMap output_map_;
-  OrtMutex mutex_;
+  std::mutex mutex_;

   bool use_dispatch_;
diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc
index 2258d1ac1cd8f..1a337e185b497 100644
--- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc
+++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc
@@ -291,7 +291,7 @@ common::Status WebNNExecutionProvider::Compile(const std::vector<FusedNodeAndGraph>& fused_nodes_and_graphs,
-      std::unique_lock<OrtMutex> lock(model->GetMutex());
+      std::unique_lock<std::mutex> lock(model->GetMutex());
       InlinedHashMap outputs;
       outputs.reserve(model_outputs.size());
       for (size_t i = 0; i < model_outputs.size(); i++) {
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 023cbcbe88d1c..f5f12c206ebad 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -249,7 +249,7 @@ Status GetMinimalBuildOptimizationHandling(
 std::atomic<uint32_t> InferenceSession::global_session_id_{1};
 std::map<uint32_t, InferenceSession*> InferenceSession::active_sessions_;
 #ifdef _WIN32
-OrtMutex InferenceSession::active_sessions_mutex_;  // Protects access to active_sessions_
+std::mutex InferenceSession::active_sessions_mutex_;  // Protects access to active_sessions_
 onnxruntime::WindowsTelemetry::EtwInternalCallback InferenceSession::callback_ML_ORT_provider_;
 #endif
@@ -371,7 +371,7 @@ void InferenceSession::ConstructorCommon(const SessionOptions& session_options,
   session_id_ = global_session_id_.fetch_add(1);
 #ifdef _WIN32
-  std::lock_guard<OrtMutex> lock(active_sessions_mutex_);
+  std::lock_guard<std::mutex> lock(active_sessions_mutex_);
   active_sessions_[global_session_id_++] = this;

   // Register callback for ETW capture state (rundown) for Microsoft.ML.ONNXRuntime provider
@@ -725,7 +725,7 @@ InferenceSession::~InferenceSession() {
   // Unregister the session and ETW callbacks
 #ifdef _WIN32
-  std::lock_guard<OrtMutex> lock(active_sessions_mutex_);
+  std::lock_guard<std::mutex> lock(active_sessions_mutex_);
   WindowsTelemetry::UnregisterInternalCallback(callback_ML_ORT_provider_);
   logging::EtwRegistrationManager::Instance().UnregisterInternalCallback(callback_ETWSink_provider_);
 #endif
@@ -745,7 +745,7 @@ common::Status InferenceSession::RegisterExecutionProvider(const std::shared_ptr
     return Status(common::ONNXRUNTIME, common::FAIL, "Received nullptr for exec provider");
   }

-  std::lock_guard<OrtMutex> l(session_mutex_);
+  std::lock_guard<std::mutex> l(session_mutex_);

   if (is_inited_) {
     // adding an EP is pointless as the graph has already been partitioned so no nodes will be assigned to
@@ -876,7 +876,7 @@ common::Status InferenceSession::RegisterGraphTransformer(
     return Status(common::ONNXRUNTIME, common::FAIL, "Received nullptr for graph transformer");
   }

-  std::lock_guard<OrtMutex> l(session_mutex_);
+  std::lock_guard<std::mutex> l(session_mutex_);

   if (is_inited_) {
     // adding a transformer now is pointless as the graph has already been transformed
@@ -940,7 +940,7 @@ common::Status InferenceSession::LoadWithLoader(std::function<common::Status(std::shared_ptr<Model>&)> loader,
-  std::lock_guard<OrtMutex> l(session_mutex_);
+  std::lock_guard<std::mutex> l(session_mutex_);
   if (is_model_loaded_) {  // already loaded
     LOGS(*session_logger_, ERROR) << "This session already contains a loaded model.";
     return common::Status(common::ONNXRUNTIME, common::MODEL_LOADED, "This session already contains a loaded model.");
@@ -1396,7 +1396,7 @@ Status InferenceSession::LoadOrtModel(const void* model_data, int model_data_len
 }
 Status InferenceSession::LoadOrtModelWithLoader(std::function<Status()> load_ort_format_model_bytes) {
-  std::lock_guard<OrtMutex> l(session_mutex_);
+  std::lock_guard<std::mutex> l(session_mutex_);
   if (is_model_loaded_) {  // already loaded
     Status status(common::ONNXRUNTIME, common::MODEL_LOADED, "This session already contains a loaded model.");
@@ -1520,7 +1520,7 @@ Status InferenceSession::LoadOrtModelWithLoader(std::function<Status()> load_ort
 }

 bool InferenceSession::IsInitialized() const {
-  std::lock_guard<OrtMutex> l(session_mutex_);
+  std::lock_guard<std::mutex> l(session_mutex_);
   return is_inited_;
 }
@@ -1673,7 +1673,7 @@ common::Status InferenceSession::Initialize() {
   bool have_cpu_ep = false;

   {
-    std::lock_guard<OrtMutex> initial_guard(session_mutex_);
+    std::lock_guard<std::mutex> initial_guard(session_mutex_);

     if (!is_model_loaded_) {
       LOGS(*session_logger_, ERROR) << "Model was not loaded";
@@ -1711,7 +1711,7 @@ common::Status InferenceSession::Initialize() {
   }

   // re-acquire mutex
-  std::lock_guard<OrtMutex> l(session_mutex_);
+  std::lock_guard<std::mutex> l(session_mutex_);

 #if !defined(DISABLE_EXTERNAL_INITIALIZERS) && !defined(ORT_MINIMAL_BUILD)
   if (!session_options_.external_initializers.empty()) {
@@ -2584,7 +2584,7 @@ Status InferenceSession::Run(const RunOptions& run_options,
   std::unique_ptr<logging::Logger> owned_run_logger;
   const auto& run_logger = CreateLoggerForRun(run_options, owned_run_logger);

-  std::optional<std::lock_guard<OrtMutex>> sequential_run_lock;
+  std::optional<std::lock_guard<std::mutex>> sequential_run_lock;
   if (is_concurrent_run_supported_ == false) {
     sequential_run_lock.emplace(session_mutex_);
   }
@@ -2837,7 +2837,7 @@ common::Status InferenceSession::Run(const RunOptions& run_options, const NameML
 std::pair<common::Status, const ModelMetadata*> InferenceSession::GetModelMetadata() const {
   {
-    std::lock_guard<OrtMutex> l(session_mutex_);
+    std::lock_guard<std::mutex> l(session_mutex_);
     if (!is_model_loaded_) {
       LOGS(*session_logger_, ERROR) << "Model was not loaded";
       return std::make_pair(common::Status(common::ONNXRUNTIME, common::FAIL, "Model was not loaded."), nullptr);
@@ -2849,7 +2849,7 @@ std::pair<common::Status, const ModelMetadata*> InferenceSession::GetModelMetada
 std::pair<common::Status, const InputDefList*> InferenceSession::GetModelInputs() const {
   {
-    std::lock_guard<OrtMutex> l(session_mutex_);
+    std::lock_guard<std::mutex> l(session_mutex_);
     if (!is_model_loaded_) {
       LOGS(*session_logger_, ERROR) << "Model was not loaded";
       return std::make_pair(common::Status(common::ONNXRUNTIME, common::FAIL, "Model was not loaded."), nullptr);
@@ -2862,7 +2862,7 @@ std::pair<common::Status, const InputDefList*> InferenceSession::GetModelInputs(
 std::pair<common::Status, const InputDefList*> InferenceSession::GetOverridableInitializers() const {
   {
-    std::lock_guard<OrtMutex> l(session_mutex_);
+    std::lock_guard<std::mutex> l(session_mutex_);
     if (!is_model_loaded_) {
       LOGS(*session_logger_, ERROR) << "Model was not loaded";
       return std::make_pair(common::Status(common::ONNXRUNTIME, common::FAIL, "Model was not loaded."), nullptr);
@@ -2875,7 +2875,7 @@ std::pair<common::Status, const InputDefList*> InferenceSession::GetOverridableI
 std::pair<common::Status, const OutputDefList*> InferenceSession::GetModelOutputs() const {
   {
-    std::lock_guard<OrtMutex> l(session_mutex_);
+    std::lock_guard<std::mutex> l(session_mutex_);
     if (!is_model_loaded_) {
       LOGS(*session_logger_, ERROR) << "Model was not loaded";
       return std::make_pair(common::Status(common::ONNXRUNTIME, common::FAIL, "Model was not loaded."), nullptr);
@@ -2887,7 +2887,7 @@ std::pair<common::Status, const OutputDefList*> InferenceSession::GetModelOutput
 common::Status InferenceSession::NewIOBinding(std::unique_ptr<IOBinding>* io_binding) {
   {
-    std::lock_guard<OrtMutex> l(session_mutex_);
+    std::lock_guard<std::mutex> l(session_mutex_);
     if (!is_inited_) {
       LOGS(*session_logger_, ERROR) << "Session was not initialized";
       return common::Status(common::ONNXRUNTIME, common::FAIL, "Session not initialized.");
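`sequential_run_lock` above is worth calling out: wrapping the guard in `std::optional` makes the lock conditional while keeping scope-based unlock. A sketch of just that idiom:

```cpp
#include <mutex>
#include <optional>

// Lock session_mutex only when concurrent Run() is unsupported; either way,
// the optional's destructor releases whatever was taken.
void Run(std::mutex& session_mutex, bool concurrent_run_supported) {
  std::optional<std::lock_guard<std::mutex>> sequential_run_lock;
  if (!concurrent_run_supported) {
    sequential_run_lock.emplace(session_mutex);
  }
  // ... execute the graph ...
}
```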
@@ -3271,7 +3271,7 @@ IOBinding* SessionIOBinding::Get() {
 void InferenceSession::LogAllSessions() {
   const Env& env = Env::Default();

-  std::lock_guard<OrtMutex> lock(active_sessions_mutex_);
+  std::lock_guard<std::mutex> lock(active_sessions_mutex_);
   for (const auto& session_pair : active_sessions_) {
     InferenceSession* session = session_pair.second;
diff --git a/onnxruntime/core/session/inference_session.h b/onnxruntime/core/session/inference_session.h
index 322c1917b9eaf..424248da793f1 100644
--- a/onnxruntime/core/session/inference_session.h
+++ b/onnxruntime/core/session/inference_session.h
@@ -29,7 +29,7 @@
 #include "core/optimizer/graph_transformer_level.h"
 #include "core/optimizer/graph_transformer_mgr.h"
 #include "core/optimizer/insert_cast_transformer.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #ifdef ENABLE_LANGUAGE_INTEROP_OPS
 #include "core/language_interop_ops/language_interop_ops.h"
 #endif
@@ -129,7 +129,7 @@ class InferenceSession {
   using InputOutputDefMetaMap = InlinedHashMap;
   static std::map<uint32_t, InferenceSession*> active_sessions_;
 #ifdef _WIN32
-  static OrtMutex active_sessions_mutex_;  // Protects access to active_sessions_
+  static std::mutex active_sessions_mutex_;  // Protects access to active_sessions_
   static onnxruntime::WindowsTelemetry::EtwInternalCallback callback_ML_ORT_provider_;
   onnxruntime::logging::EtwRegistrationManager::EtwInternalCallback callback_ETWSink_provider_;
 #endif
@@ -799,10 +799,10 @@ class InferenceSession {
   // Number of concurrently running executors
   std::atomic<int> current_num_runs_ = 0;

-  mutable onnxruntime::OrtMutex session_mutex_;  // to ensure only one thread can invoke Load/Initialize
-  bool is_model_loaded_ = false;                 // GUARDED_BY(session_mutex_)
-  bool is_inited_ = false;                       // GUARDED_BY(session_mutex_)
-  bool is_concurrent_run_supported_ = true;      // Graph execution in Run is GUARDED_BY(session_mutex_) if false
+  mutable std::mutex session_mutex_;         // to ensure only one thread can invoke Load/Initialize
+  bool is_model_loaded_ = false;             // GUARDED_BY(session_mutex_)
+  bool is_inited_ = false;                   // GUARDED_BY(session_mutex_)
+  bool is_concurrent_run_supported_ = true;  // Graph execution in Run is GUARDED_BY(session_mutex_) if false

 #ifdef ENABLE_LANGUAGE_INTEROP_OPS
   InterOpDomains interop_domains_;
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index 8280270a768f0..109445c877786 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -36,7 +36,7 @@
 #include "core/framework/data_types.h"
 #include "abi_session_options_impl.h"
 #include "core/framework/TensorSeq.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/common/string_helper.h"
 #include "core/session/lora_adapters.h"
diff --git a/onnxruntime/core/session/ort_env.cc b/onnxruntime/core/session/ort_env.cc
index 3c178fd1e91d3..ef84875df18a3 100644
--- a/onnxruntime/core/session/ort_env.cc
+++ b/onnxruntime/core/session/ort_env.cc
@@ -19,7 +19,7 @@ using namespace onnxruntime::logging;

 std::unique_ptr<OrtEnv> OrtEnv::p_instance_;
 int OrtEnv::ref_count_ = 0;
-onnxruntime::OrtMutex OrtEnv::m_;
+std::mutex OrtEnv::m_;

 OrtEnv::OrtEnv(std::unique_ptr<onnxruntime::Environment> value1)
     : value_(std::move(value1)) {
@@ -35,7 +35,7 @@ OrtEnv::~OrtEnv() {
 OrtEnv* OrtEnv::GetInstance(const OrtEnv::LoggingManagerConstructionInfo& lm_info,
                             onnxruntime::common::Status& status,
                             const OrtThreadingOptions* tp_options) {
-  std::lock_guard<OrtMutex> lock(m_);
+  std::lock_guard<std::mutex> lock(m_);
   if (!p_instance_) {
     std::unique_ptr<LoggingManager> lmgr;
     std::string name = lm_info.logid;
@@ -76,7 +76,7 @@ void OrtEnv::Release(OrtEnv* env_ptr) {
   if (!env_ptr) {
     return;
   }
-  std::lock_guard<OrtMutex> lock(m_);
+  std::lock_guard<std::mutex> lock(m_);
   ORT_ENFORCE(env_ptr == p_instance_.get());  // sanity check
   --ref_count_;
   if (ref_count_ == 0) {
diff --git a/onnxruntime/core/session/ort_env.h b/onnxruntime/core/session/ort_env.h
index 444134d0612e9..64e0020f2930d 100644
--- a/onnxruntime/core/session/ort_env.h
+++ b/onnxruntime/core/session/ort_env.h
@@ -5,7 +5,7 @@
 #include
 #include
 #include "core/session/onnxruntime_c_api.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/common/status.h"
 #include "core/common/logging/logging.h"
 #include "core/framework/allocator.h"
@@ -67,7 +67,7 @@ struct OrtEnv {

 private:
  static std::unique_ptr<OrtEnv> p_instance_;
-  static onnxruntime::OrtMutex m_;
+  static std::mutex m_;
  static int ref_count_;

  std::unique_ptr<onnxruntime::Environment> value_;
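The `OrtEnv` hunks combine a singleton with manual reference counting under one static mutex. A self-contained sketch of that lifetime scheme (simplified, error handling omitted):

```cpp
#include <memory>
#include <mutex>

class Env {
 public:
  static Env* GetInstance() {
    std::lock_guard<std::mutex> lock(m_);
    if (!instance_) instance_.reset(new Env());
    ++ref_count_;
    return instance_.get();
  }

  static void Release(Env* p) {
    if (!p) return;
    std::lock_guard<std::mutex> lock(m_);
    if (--ref_count_ == 0) instance_.reset();  // destroy on last release
  }

 private:
  Env() = default;
  static std::unique_ptr<Env> instance_;
  static std::mutex m_;
  static int ref_count_;
};

std::unique_ptr<Env> Env::instance_;
std::mutex Env::m_;
int Env::ref_count_ = 0;
```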
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
index 9c1c31626066d..edef0d3ee5453 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
@@ -40,9 +40,8 @@ docker run --rm -it --gpus all -v $PWD:/workspace nvcr.io/nvidia/pytorch:24.04-p
 ```

 #### Build onnxruntime from source
-The cuDNN in the container might not be compatible with official onnxruntime-gpu package, it is recommended to build from source instead.
+This step is optional. Please look at [install onnxruntime-gpu](https://onnxruntime.ai/docs/install/#python-installs) if you do not want to build from source.

-After launching the docker, you can build and install onnxruntime-gpu wheel like the following.
 ```
 export CUDACXX=/usr/local/cuda/bin/nvcc
 git config --global --add safe.directory '*'
@@ -60,9 +59,17 @@ If the GPU is not A100, change `CMAKE_CUDA_ARCHITECTURES=80` in the command line
 If your machine has less than 64GB memory, replace `--parallel` by `--parallel 4 --nvcc_threads 1` to avoid out of memory.

 #### Install required packages
+First, remove older versions of opencv to avoid errors like `module 'cv2.dnn' has no attribute 'DictValue'`:
+```
+pip uninstall -y $(pip list --format=freeze | grep opencv)
+rm -rf /usr/local/lib/python3.10/dist-packages/cv2/
+apt-get update
+DEBIAN_FRONTEND="noninteractive" apt-get install --yes python3-opencv
+```
+
 ```
 cd /workspace/onnxruntime/python/tools/transformers/models/stable_diffusion
-python3 -m pip install -r requirements-cuda12.txt
+python3 -m pip install -r requirements/cuda12/requirements.txt
 python3 -m pip install --upgrade polygraphy onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
 ```
@@ -136,15 +143,18 @@ conda activate py310

 ### Setup Environment (CUDA) without docker

-First, we need install CUDA 11.8 or 12.1, [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html) 8.5 or above, and [TensorRT 8.6.1](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html) in the machine.
+First, we need to install CUDA 11.8 or 12.x, [cuDNN](https://docs.nvidia.com/deeplearning/cudnn/install-guide/index.html), and [TensorRT](https://docs.nvidia.com/deeplearning/tensorrt/install-guide/index.html) on the machine.
+
+The version of cuDNN can be found in https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html#requirements.
+The version of TensorRT can be found in https://onnxruntime.ai/docs/execution-providers/TensorRT-ExecutionProvider.html#requirements.
 #### CUDA 11.8:
-In the Conda environment, install PyTorch 2.1 or above, and other required packages like the following:
+In the Conda environment, install PyTorch 2.1 up to 2.3.1, and other required packages like the following:
 ```
-pip install torch --index-url https://download.pytorch.org/whl/cu118
+pip install torch>=2.1,<2.4 --index-url https://download.pytorch.org/whl/cu118
 pip install --upgrade polygraphy onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
-pip install -r requirements-cuda11.txt
+pip install -r requirements/cuda11/requirements.txt
 ```

 For Windows, install nvtx like the following:
@@ -157,77 +167,40 @@ We cannot directly `pip install tensorrt` for CUDA 11. Follow https://github.com
 For Windows, pip install the tensorrt wheel in the downloaded TensorRT zip file instead. Like `pip install tensorrt-8.6.1.6.windows10.x86_64.cuda-11.8\tensorrt-8.6.1.6\python\tensorrt-8.6.1-cp310-none-win_amd64.whl`.

 #### CUDA 12.*:
-The official package of onnxruntime-gpu 1.16.* is built for CUDA 11.8. To use CUDA 12.*, you will need [build onnxruntime from source](https://onnxruntime.ai/docs/build/inferencing.html).
-
-```
-git clone --recursive https://github.com/Microsoft/onnxruntime.git
-cd onnxruntime
-pip install cmake
-pip install -r requirements-dev.txt
-```
-Follow [example script for A100 in Ubuntu](https://github.com/microsoft/onnxruntime/blob/26a7b63716e3125bfe35fe3663ba10d2d7322628/build_release.sh)
-or [example script for RTX 4090 in Windows](https://github.com/microsoft/onnxruntime/blob/8df5f4e0df1f3b9ceeb0f1f2561b09727ace9b37/build_trt.cmd) to build and install onnxruntime-gpu wheel.
-
-Then install other python packages like the following:
+The official package of onnxruntime-gpu 1.19.x is built for CUDA 12.x. You can install it and other python packages like the following:
 ```
-pip install torch --index-url https://download.pytorch.org/whl/cu121
+pip install onnxruntime-gpu
+pip install torch --index-url https://download.pytorch.org/whl/cu124
 pip install --upgrade polygraphy onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
-pip install -r requirements-cuda12.txt
+pip install -r requirements/cuda12/requirements.txt
 ```
 Finally, `pip install tensorrt` for Linux. For Windows, pip install the tensorrt wheel in the downloaded TensorRT zip file instead.

 ### Setup Environment (ROCm)

-It is recommended that the users run the model with ROCm 5.4 or newer and Python 3.10.
+It is recommended to run the model with ROCm 6.2 or newer and Python 3.10.
+Follow this guide to install ROCm 6.x: https://rocmdocs.amd.com/projects/install-on-linux/en/latest/install/quick-start.html
 Note that Windows is not supported for ROCm at the moment.

 ```
-wget https://repo.radeon.com/rocm/manylinux/rocm-rel-5.4/torch-1.12.1%2Brocm5.4-cp38-cp38-linux_x86_64.whl
-pip install torch-1.12.1+rocm5.4-cp38-cp38-linux_x86_64.whl
-pip install -r requirements-rocm.txt
+pip install -r requirements/rocm/requirements.txt
 ```

-AMD GPU version of PyTorch can be installed from [pytorch.org](https://pytorch.org/get-started/locally/) or [AMD Radeon repo](https://repo.radeon.com/rocm/manylinux/rocm-rel-5.4/).
+AMD GPU version of PyTorch can be installed from [pytorch.org](https://pytorch.org/get-started/locally/) or [AMD Radeon repo](https://repo.radeon.com/rocm/manylinux/rocm-rel-6.2.3/).

 #### Install onnxruntime-rocm
-Here is an example to build onnxruntime from source with Rocm 5.4.2 in Ubuntu 20.04, and install the wheel.
-
-(1) Install [ROCm 5.4.2](https://docs.amd.com/bundle/ROCm-Installation-Guide-v5.4.2/page/How_to_Install_ROCm.html). Note that the version is also used in PyTorch 2.0 ROCm package.
-
-(2) Install some tools used in build:
-```
-sudo apt-get update
-sudo apt-get install -y --no-install-recommends \
-  wget \
-  zip \
-  ca-certificates \
-  build-essential \
-  curl \
-  libcurl4-openssl-dev \
-  libssl-dev \
-  python3-dev
-pip install numpy packaging "wheel>=0.35.1"
-wget --quiet https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-x86_64.tar.gz
-tar zxf cmake-3.26.3-linux-x86_64.tar.gz
-export PATH=${PWD}/cmake-3.26.3-linux-x86_64/bin:${PATH}
-```
-
-(3) Build and Install ONNX Runtime
+One option is to install a prebuilt wheel from https://repo.radeon.com/rocm/manylinux like:
 ```
-git clone https://github.com/microsoft/onnxruntime
-cd onnxruntime
-sh build.sh --config Release --use_rocm --rocm_home /opt/rocm --rocm_version 5.4.2 --build_wheel
-pip install build/Linux/Release/dist/*.whl
+wget https://repo.radeon.com/rocm/manylinux/rocm-rel-6.2.3/onnxruntime_rocm-1.18.0-cp310-cp310-linux_x86_64.whl
+pip install onnxruntime_rocm-1.18.0-cp310-cp310-linux_x86_64.whl
 ```
-You can also follow the [official docs](https://onnxruntime.ai/docs/build/eps.html#amd-rocm) to build with docker.
+If you want to use the latest version of onnxruntime, you can build from source with ROCm 6.x following https://onnxruntime.ai/docs/build/eps.html#amd-rocm.
+When the build is finished, you can install the wheel: `pip install build/Linux/Release/dist/*.whl`.

 ### Export ONNX pipeline
 This step will export stable diffusion 1.5 to an ONNX model in float32 using a script from diffusers.

-It is recommended to use PyTorch 1.12.1 or 1.13.1 in this step. Using PyTorch 2.0 will encounter issue in exporting onnx.
-
 ```
 curl https://raw.githubusercontent.com/huggingface/diffusers/v0.15.1/scripts/convert_stable_diffusion_checkpoint_to_onnx.py > convert_sd_onnx.py
 python convert_sd_onnx.py --model_path runwayml/stable-diffusion-v1-5 --output_path ./sd_v1_5/fp32
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt
deleted file mode 100644
index c0a925e25b941..0000000000000
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-rocm.txt
+++ /dev/null
@@ -1,5 +0,0 @@
--r requirements.txt
-# Install onnxruntime-rocm or onnxruntime_training
-# Build onnxruntime-rocm from source
-# Directly install pre-built onnxruntime/onnxruntime-training rocm python package is not possible at the moment.
-# TODO: update once we have public pre-built packages
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda12.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/cuda11/requirements.txt
similarity index 64%
rename from onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda12.txt
rename to onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/cuda11/requirements.txt
index 4aa88cdf92309..bbc62ca4cbd18 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda12.txt
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/cuda11/requirements.txt
@@ -1,13 +1,13 @@
--r requirements.txt
+-r ../requirements.txt

-# For CUDA 12.*, you will need build onnxruntime-gpu from source and install the wheel. See README.md for detail.
+# See https://onnxruntime.ai/docs/install/#python-installs for installation. The latest one on PyPI is for CUDA 12.
 # onnxruntime-gpu>=1.16.2

 py3nvml

 # The version of cuda-python shall be compatible with installed CUDA version.
 # For demo of TensorRT execution provider and TensorRT.
-cuda-python>=12.1.0
+cuda-python==11.8.0

 # For windows, cuda-python needs the following
 pywin32; platform_system == "Windows"
@@ -15,8 +15,8 @@ pywin32; platform_system == "Windows"
 # For windows, run `conda install -c conda-forge nvtx` instead
 nvtx; platform_system != "Windows"

-# Please install PyTorch 2.1 or above for 12.1 using one of the following commands:
-# pip3 install torch --index-url https://download.pytorch.org/whl/cu121
+# Please install PyTorch >=2.1 and <2.4 for CUDA 11.8 like the following:
+# pip install torch==2.3.1 --index-url https://download.pytorch.org/whl/cu118

 # Run the following command to install some extra packages for onnx graph optimization for TensorRT manually.
 # pip3 install --upgrade polygraphy onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda11.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/cuda12/requirements.txt
similarity index 73%
rename from onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda11.txt
rename to onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/cuda12/requirements.txt
index dc6592fc2fa54..89562e920ac00 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda11.txt
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/cuda12/requirements.txt
@@ -1,13 +1,12 @@
--r requirements.txt
+-r ../requirements.txt

-# Official onnxruntime-gpu 1.16.1 is built with CUDA 11.8.
-onnxruntime-gpu>=1.16.2
+onnxruntime-gpu>=1.19.2

 py3nvml

 # The version of cuda-python shall be compatible with installed CUDA version.
 # For demo of TensorRT execution provider and TensorRT.
-cuda-python==11.8.0
+cuda-python>=12.1.0

 # For windows, cuda-python needs the following
 pywin32; platform_system == "Windows"
@@ -15,8 +14,8 @@ pywin32; platform_system == "Windows"
 # For windows, run `conda install -c conda-forge nvtx` instead
 nvtx; platform_system != "Windows"

-# Please install PyTorch 2.1 or above for CUDA 11.8 using one of the following commands:
-# pip3 install torch --index-url https://download.pytorch.org/whl/cu118
+# Please install PyTorch 2.4 or above using one of the following commands:
+# pip3 install torch --index-url https://download.pytorch.org/whl/cu124

 # Run the following command to install some extra packages for onnx graph optimization for TensorRT manually.
 # pip3 install --upgrade polygraphy onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt
similarity index 63%
rename from onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt
rename to onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt
index 5080737516c53..8c9f0ba0f21be 100644
--- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt
@@ -1,3 +1,4 @@
+huggingface_hub==0.25.2
 diffusers==0.28.0
 transformers==4.41.2
 numpy>=1.24.1
@@ -14,6 +15,4 @@ controlnet_aux==0.0.9
 optimum==1.20.0
 safetensors
 invisible_watermark
-# newer version of opencv-python migth encounter module 'cv2.dnn' has no attribute 'DictValue' error
-opencv-python==4.8.0.74
-opencv-python-headless==4.8.0.74
+opencv-python-headless
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/rocm/requirements.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/rocm/requirements.txt
new file mode 100644
index 0000000000000..21b100fb61f17
--- /dev/null
+++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/rocm/requirements.txt
@@ -0,0 +1,2 @@
+-r ../requirements.txt
+# Install onnxruntime-rocm that is built from source (https://onnxruntime.ai/docs/build/eps.html#amd-rocm)
diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc
index 45aaca1ceae56..e564443ed8eb0 100644
--- a/onnxruntime/test/onnx/TestCase.cc
+++ b/onnxruntime/test/onnx/TestCase.cc
@@ -25,7 +25,7 @@
 #include "core/common/logging/logging.h"
 #include "core/common/common.h"
 #include "core/platform/env.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/platform/path_lib.h"
 #include "core/session/onnxruntime_cxx_api.h"
 #include "core/framework/allocator.h"
@@ -288,12 +288,12 @@ class OnnxTestCase : public ITestCase {
 private:
  std::string test_case_name_;
  mutable std::vector<std::string> debuginfo_strings_;
-  mutable onnxruntime::OrtMutex m_;
+  mutable std::mutex m_;

  std::vector<std::basic_string<PATH_CHAR_TYPE>> test_data_dirs_;

  std::string GetDatasetDebugInfoString(size_t dataset_id) const override {
-    std::lock_guard<OrtMutex> l(m_);
+    std::lock_guard<std::mutex> l(m_);
    if (dataset_id < debuginfo_strings_.size()) {
      return debuginfo_strings_[dataset_id];
    }
@@ -488,7 +488,7 @@ void OnnxTestCase::LoadTestData(size_t id, onnxruntime::test::HeapBuffer& b,
  if (st.IsOK()) {  // has an all-in-one input file
    std::ostringstream oss;
    {
-      std::lock_guard<OrtMutex> l(m_);
+      std::lock_guard<std::mutex> l(m_);
      oss << debuginfo_strings_[id];
    }
    ORT_TRY {
@@ -503,7 +503,7 @@ void OnnxTestCase::LoadTestData(size_t id, onnxruntime::test::HeapBuffer& b,
  }
  {
-    std::lock_guard<OrtMutex> l(m_);
+    std::lock_guard<std::mutex> l(m_);
    debuginfo_strings_[id] = oss.str();
  }
  return;
@@ -1026,7 +1026,13 @@ std::unique_ptr<std::set<BrokenTest>> GetBrokenTests(const std::string& provider
      {"dequantizelinear_int4", "Bug with model input name 'zero_point' not matching node's input name", {}},
      {"dequantizelinear_uint4", "Bug with model input name 'zero_point' not matching node's input name", {}},
      {"quantizelinear_int4", "Bug with model input name 'zero_point' not matching node's input name", {}},
-      {"quantizelinear_uint4", "Bug with model input name 'zero_point' not matching node's input name", {}}});
+      {"quantizelinear_uint4", "Bug with model input name 'zero_point' not matching node's input name", {}},
+      {"qlinearmatmul_2D_int8_float16", "fp16 type not supported by CPU EP", {}},
+      {"qlinearmatmul_2D_int8_float32", "result diff", {}},
+      {"qlinearmatmul_2D_uint8_float16", "fp16 type not supported by CPU EP", {}},
+      {"qlinearmatmul_3D_int8_float16", "fp16 type not supported by CPU EP", {}},
+      {"qlinearmatmul_3D_int8_float32", "result diff", {}},
+      {"qlinearmatmul_3D_uint8_float16", "fp16 type not supported by CPU EP", {}}});

  // Some EPs may fail to pass some specific testcases.
  // For example TensorRT EP may fail on FLOAT16 related testcases if GPU doesn't support float16.
diff --git a/onnxruntime/test/onnx/TestResultStat.h b/onnxruntime/test/onnx/TestResultStat.h
index 5bfc04c3cd577..0804b1d7a4139 100644
--- a/onnxruntime/test/onnx/TestResultStat.h
+++ b/onnxruntime/test/onnx/TestResultStat.h
@@ -7,7 +7,7 @@
 #include
 #include
 #include
-#include <core/platform/ort_mutex.h>
+#include <mutex>
 #include
 #include
@@ -26,22 +26,22 @@ class TestResultStat {
  TestResultStat() : succeeded(0), not_implemented(0), load_model_failed(0), throwed_exception(0), result_differs(0), skipped(0), invalid_graph(0) {}

  void AddNotImplementedKernels(const std::string& s) {
-    std::lock_guard<OrtMutex> l(m_);
+    std::lock_guard<std::mutex> l(m_);
    not_implemented_kernels.insert(s);
  }

  void AddFailedKernels(const std::string& s) {
-    std::lock_guard<OrtMutex> l(m_);
+    std::lock_guard<std::mutex> l(m_);
    failed_kernels.insert(s);
  }

  void AddFailedTest(const std::pair<std::string, std::string>& p) {
-    std::lock_guard<OrtMutex> l(m_);
+    std::lock_guard<std::mutex> l(m_);
    failed_test_cases.insert(p);
  }

  const std::set<std::pair<std::string, std::string>>& GetFailedTest() const {
-    std::lock_guard<OrtMutex> l(m_);
+    std::lock_guard<std::mutex> l(m_);
    return failed_test_cases;
  }
@@ -74,7 +74,7 @@ class TestResultStat {
  }

 private:
-  mutable onnxruntime::OrtMutex m_;
+  mutable std::mutex m_;
  std::unordered_set<std::string> not_implemented_kernels;
  std::unordered_set<std::string> failed_kernels;
  std::set<std::pair<std::string, std::string>> failed_test_cases;  // pairs of test name and version
diff --git a/onnxruntime/test/onnx/onnxruntime_event.h b/onnxruntime/test/onnx/onnxruntime_event.h
index b830a9f888edb..a7cfbccad3d8a 100644
--- a/onnxruntime/test/onnx/onnxruntime_event.h
+++ b/onnxruntime/test/onnx/onnxruntime_event.h
@@ -2,12 +2,12 @@
 // Licensed under the MIT License.
 #include
-#include <core/platform/ort_mutex.h>
+#include <mutex>

 struct OnnxRuntimeEvent {
 public:
-  onnxruntime::OrtMutex finish_event_mutex;
-  onnxruntime::OrtCondVar finish_event_data;
+  std::mutex finish_event_mutex;
+  std::condition_variable finish_event_data;
  bool finished = false;

  OnnxRuntimeEvent() = default;
diff --git a/onnxruntime/test/perftest/performance_runner.cc b/onnxruntime/test/perftest/performance_runner.cc
index 08d77008dc25c..faf0c34193717 100644
--- a/onnxruntime/test/perftest/performance_runner.cc
+++ b/onnxruntime/test/perftest/performance_runner.cc
@@ -189,8 +189,8 @@ Status PerformanceRunner::RunParallelDuration() {
  // TODO: Make each thread enqueue a new worker.
  auto tpool = GetDefaultThreadPool(Env::Default());
  std::atomic<int> counter = {0};
-  OrtMutex m;
-  OrtCondVar cv;
+  std::mutex m;
+  std::condition_variable cv;

  auto start = std::chrono::high_resolution_clock::now();
  auto end = start;
@@ -206,7 +206,7 @@ Status PerformanceRunner::RunParallelDuration() {
      if (!status.IsOK()) std::cerr << status.ErrorMessage();

      // Simplified version of Eigen::Barrier
-      std::lock_guard<OrtMutex> lg(m);
+      std::lock_guard<std::mutex> lg(m);
      counter--;
      cv.notify_all();
    });
@@ -216,7 +216,7 @@ Status PerformanceRunner::RunParallelDuration() {
  } while (duration_seconds.count() < performance_test_config_.run_config.duration_in_seconds);

  // Join
-  std::unique_lock<OrtMutex> lock(m);
+  std::unique_lock<std::mutex> lock(m);
  cv.wait(lock, [&counter]() { return counter == 0; });

  return Status::OK();
@@ -228,8 +228,8 @@ Status PerformanceRunner::ForkJoinRepeat() {

  // create a threadpool with one thread per concurrent request
  auto tpool = std::make_unique(run_config.concurrent_session_runs);
  std::atomic<int> counter{0}, requests{0};
-  OrtMutex m;
-  OrtCondVar cv;
+  std::mutex m;
+  std::condition_variable cv;

  // Fork
  for (size_t i = 0; i != run_config.concurrent_session_runs; ++i) {
@@ -242,14 +242,14 @@ Status PerformanceRunner::ForkJoinRepeat() {
      }

      // Simplified version of Eigen::Barrier
-      std::lock_guard<OrtMutex> lg(m);
+      std::lock_guard<std::mutex> lg(m);
      counter--;
      cv.notify_all();
    });
  }

  // Join
-  std::unique_lock<OrtMutex> lock(m);
+  std::unique_lock<std::mutex> lock(m);
  cv.wait(lock, [&counter]() { return counter == 0; });

  return Status::OK();
diff --git a/onnxruntime/test/perftest/performance_runner.h b/onnxruntime/test/perftest/performance_runner.h
index cb1cb661550a7..b0a0161e7fd6c 100644
--- a/onnxruntime/test/perftest/performance_runner.h
+++ b/onnxruntime/test/perftest/performance_runner.h
@@ -14,7 +14,7 @@
 #include
 #include
 #include
-#include <core/platform/ort_mutex.h>
+#include <mutex>
 #include
 #include "test_configuration.h"
 #include "heap_buffer.h"
@@ -75,7 +75,7 @@ class PerformanceRunner {
    ORT_RETURN_IF_ERROR(status);

    if (!isWarmup) {
-      std::lock_guard<OrtMutex> guard(results_mutex_);
+      std::lock_guard<std::mutex> guard(results_mutex_);
      performance_result_.time_costs.emplace_back(duration_seconds.count());
      performance_result_.total_time_cost += duration_seconds.count();
      if (performance_test_config_.run_config.f_verbose) {
@@ -116,7 +116,7 @@ class PerformanceRunner {
  onnxruntime::test::HeapBuffer b_;
  std::unique_ptr<ITestCase> test_case_;

-  OrtMutex results_mutex_;
+  std::mutex results_mutex_;
 };
 }  // namespace perftest
 }  // namespace onnxruntime
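The perf runner's "simplified version of Eigen::Barrier" deserves a standalone look, since the same three-piece construction (atomic counter, mutex, condition variable) appears twice above. A runnable sketch:

```cpp
#include <atomic>
#include <condition_variable>
#include <mutex>

struct Barrier {
  std::atomic<int> counter;
  std::mutex m;
  std::condition_variable cv;

  explicit Barrier(int n) : counter(n) {}

  // Called by each worker when it finishes.
  void Done() {
    std::lock_guard<std::mutex> lg(m);
    counter--;
    cv.notify_all();
  }

  // Called once by the joining thread; returns when all workers are done.
  void Join() {
    std::unique_lock<std::mutex> lock(m);
    cv.wait(lock, [this]() { return counter == 0; });
  }
};
```

Decrementing under the mutex (even though the counter is atomic) is what makes the `cv.wait` predicate race-free.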
diff --git a/onnxruntime/test/platform/threadpool_test.cc b/onnxruntime/test/platform/threadpool_test.cc
index 9b3eac1088a47..e0e6c0603c784 100644
--- a/onnxruntime/test/platform/threadpool_test.cc
+++ b/onnxruntime/test/platform/threadpool_test.cc
@@ -3,7 +3,7 @@

 #include "core/platform/threadpool.h"
 #include "core/platform/EigenNonBlockingThreadPool.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/util/thread_utils.h"
 #ifdef _WIN32
 #include "test/platform/windows/env.h"
@@ -27,7 +27,7 @@ struct TestData {
  explicit TestData(int num) : data(num, 0) {
  }
  std::vector<int> data;
-  onnxruntime::OrtMutex mutex;
+  std::mutex mutex;
 };

 // This unittest tests ThreadPool function by counting the number of calls to function with each index.
@@ -38,7 +38,7 @@ std::unique_ptr<TestData> CreateTestData(int num) {
 }

 void IncrementElement(TestData& test_data, ptrdiff_t i) {
-  std::lock_guard<OrtMutex> lock(test_data.mutex);
+  std::lock_guard<std::mutex> lock(test_data.mutex);
  test_data.data[i]++;
 }
diff --git a/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc b/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc
index 8cdb837712e83..096263792727a 100644
--- a/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc
+++ b/onnxruntime/test/providers/cpu/math/quantize_linear_matmul_test.cc
@@ -126,8 +126,8 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul3D_S8S8) {
 }

 TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_U8U8) {
-  auto run_test = [](bool only_t1_not_initializer) {
-    OpTester test("QLinearMatMul", 10);
+  auto run_test = [](bool only_t1_not_initializer, int opset_version) {
+    OpTester test("QLinearMatMul", opset_version);
    test.AddInput<uint8_t>("T1", {2, 4},
                           {208, 236, 0, 238,
                            3, 214, 255, 29});
@@ -155,10 +155,12 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_U8U8) {
    test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider});
  };

-  run_test(false);
+  run_test(false, 10);
+  run_test(false, 21);

  // NNAPI will require all inputs except T1 to be initializers
-  run_test(true);
+  run_test(true, 10);
+  run_test(true, 21);
 }

 TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_U8S8) {
@@ -197,8 +199,8 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_U8S8) {
 }

 TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_S8S8) {
-  auto run_test = [](bool only_t1_not_initializer) {
-    OpTester test("QLinearMatMul", 10);
+  auto run_test = [](bool only_t1_not_initializer, int opset_version) {
+    OpTester test("QLinearMatMul", opset_version);
    test.AddInput<int8_t>("T1", {2, 4},
                          {80, -2, -128, 110,
                           -125, 86, 127, -99});
@@ -225,10 +227,12 @@ TEST(QuantizeLinearMatmulOpTest, QLinearMatMul2D_S8S8) {
    test.Run();
  };

-  run_test(false);
+  run_test(false, 10);
+  run_test(false, 21);

  // NNAPI will require all inputs except T1 to be initializers
-  run_test(true);
+  run_test(true, 10);
+  run_test(true, 21);
 }

 static void QLinearMatMul2DTest(bool only_t1_not_initializer) {
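The two training-ops diffs that follow both migrate `AlgoPerfCache`, a mutex-guarded memo table from convolution parameters to a benchmarked algorithm. A generic sketch of that shape (the key/value types here are placeholders for `ConvParams` and the perf result):

```cpp
#include <mutex>
#include <unordered_map>

template <typename Key, typename Perf>
struct AlgoPerfCache {
  mutable std::mutex mutex;
  std::unordered_map<Key, Perf> map;

  // Returns true and copies the cached perf result if present.
  bool Find(const Key& params, Perf* result) {
    std::lock_guard<std::mutex> guard(mutex);
    auto it = map.find(params);
    if (it == map.end()) return false;
    *result = it->second;
    return true;
  }

  void Insert(const Key& params, const Perf& perf) {
    std::lock_guard<std::mutex> guard(mutex);
    map[params] = perf;
  }
};
```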
diff --git a/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc b/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc
index 9b30bd128b161..d4f7fbf2080ce 100644
--- a/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc
+++ b/orttraining/orttraining/training_ops/cuda/nn/conv_shared.cc
@@ -3,7 +3,7 @@

 #include "orttraining/training_ops/cuda/nn/conv_shared.h"

-#include "core/platform/ort_mutex.h"
+#include <mutex>
 #include "core/providers/common.h"
 #include "core/providers/cuda/cuda_kernel.h"
@@ -65,11 +65,11 @@ std::vector<T_Perf> GetValidAlgorithms(const T_Perf* perf_results, int n_algo) {
 template <typename T_Perf>
 struct AlgoPerfCache {
-  mutable OrtMutex mutex;
+  mutable std::mutex mutex;
  std::unordered_map<ConvParams, T_Perf, ConvParamsHash, ConvParamsEqual> map;

  bool Find(const ConvParams& params, T_Perf* result) {
-    std::lock_guard<OrtMutex> guard(mutex);
+    std::lock_guard<std::mutex> guard(mutex);
    auto it = map.find(params);
    if (it == map.end()) {
      return false;
@@ -79,7 +79,7 @@ struct AlgoPerfCache {
  }

  void Insert(const ConvParams& params, const T_Perf& algo_perf) {
-    std::lock_guard<OrtMutex> guard(mutex);
+    std::lock_guard<std::mutex> guard(mutex);
    map[params] = algo_perf;
  }
 };
diff --git a/orttraining/orttraining/training_ops/rocm/nn/conv_grad.cc b/orttraining/orttraining/training_ops/rocm/nn/conv_grad.cc
index 22fa5b6f55a5d..3b1ed29cb0240 100644
--- a/orttraining/orttraining/training_ops/rocm/nn/conv_grad.cc
+++ b/orttraining/orttraining/training_ops/rocm/nn/conv_grad.cc
@@ -7,7 +7,7 @@

 #include "core/providers/common.h"
 #include "core/providers/rocm/shared_inc/fpgeneric.h"
-#include "core/platform/ort_mutex.h"
+#include <mutex>

 namespace onnxruntime {
 namespace rocm {
@@ -96,11 +96,11 @@ struct ConvParamsEqual {
 template <typename T_Perf>
 struct AlgoPerfCache {
-  mutable OrtMutex mutex;
+  mutable std::mutex mutex;
  std::unordered_map<ConvParams, T_Perf, ConvParamsHash, ConvParamsEqual> map;

  bool Find(const ConvParams& params, T_Perf* result) {
-    std::lock_guard<OrtMutex> guard(mutex);
+    std::lock_guard<std::mutex> guard(mutex);
    auto it = map.find(params);
    if (it == map.end()) {
      return false;
@@ -110,7 +110,7 @@ struct AlgoPerfCache {
  void Insert(const ConvParams& params, const T_Perf& algo_perf) {
-    std::lock_guard<OrtMutex> guard(mutex);
+    std::lock_guard<std::mutex> guard(mutex);
    map[params] = algo_perf;
  }
 };
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 34859005c0c65..85eb3ddad3c56 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1553,11 +1553,7 @@ def generate_build_tree(
        and not args.build_wasm
    ):
        if is_windows():
-            # DLL initialization errors due to old conda msvcp140.dll dll are a result of the new MSVC compiler
-            # See https://developercommunity.visualstudio.com/t/Access-violation-with-std::mutex::lock-a/10664660#T-N10668856
-            # Remove this definition (_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR)
-            # once the conda msvcp140.dll dll is updated.
-            cflags += ["/guard:cf", "/DWIN32", "/D_WINDOWS", "/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR"]
+            cflags += ["/guard:cf", "/DWIN32", "/D_WINDOWS"]
        if not args.use_gdk:
            # Target Windows 10
            cflags += [
diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
index a7ea5061e604e..3ee4375329069 100644
--- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@@ -42,7 +42,7 @@ parameters:
 variables:
  - template: templates/common-variables.yml
  - name: docker_base_image
-    value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241015.1
+    value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241020.1
  - name: linux_trt_version
    value: 10.3.0.26-1.cuda11.8
  - name: Repository
@@ -200,11 +200,15 @@ stages:
            nvcr.io/nvidia/pytorch:22.11-py3 \
            bash -c ' \
              set -ex; \
+              pip uninstall -y $(pip list --format=freeze | grep opencv); \
+              rm -rf /usr/local/lib/python3.8/dist-packages/cv2/; \
+              apt-get update; \
+              DEBIAN_FRONTEND="noninteractive" apt-get install --yes python3-opencv; \
              python3 --version; \
              python3 -m pip install --upgrade pip; \
              python3 -m pip install /Release/*.whl; \
              pushd /workspace/onnxruntime/python/tools/transformers/models/stable_diffusion; \
-              python3 -m pip install -r requirements-cuda11.txt; \
+              python3 -m pip install -r requirements/cuda11/requirements.txt; \
              python3 -m pip install --upgrade polygraphy onnx-graphsurgeon ; \
              echo Generate an image guided by a text prompt; \
              python3 demo_txt2img.py --framework-model-dir /model_cache --seed 1 --deterministic "astronaut riding a horse on mars" ; \
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
index 1f9b506ac451f..b0f40429c1a1e 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
@@ -49,9 +49,9 @@ parameters:
 variables:
  - name:
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 34859005c0c65..85eb3ddad3c56 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -1553,11 +1553,7 @@ def generate_build_tree(
         and not args.build_wasm
     ):
         if is_windows():
-            # DLL initialization errors due to old conda msvcp140.dll dll are a result of the new MSVC compiler
-            # See https://developercommunity.visualstudio.com/t/Access-violation-with-std::mutex::lock-a/10664660#T-N10668856
-            # Remove this definition (_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR)
-            # once the conda msvcp140.dll dll is updated.
-            cflags += ["/guard:cf", "/DWIN32", "/D_WINDOWS", "/D_DISABLE_CONSTEXPR_MUTEX_CONSTRUCTOR"]
+            cflags += ["/guard:cf", "/DWIN32", "/D_WINDOWS"]
             if not args.use_gdk:
                 # Target Windows 10
                 cflags += [
diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
index a7ea5061e604e..3ee4375329069 100644
--- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml
@@ -42,7 +42,7 @@ parameters:
 variables:
 - template: templates/common-variables.yml
 - name: docker_base_image
-  value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241015.1
+  value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241020.1
 - name: linux_trt_version
   value: 10.3.0.26-1.cuda11.8
 - name: Repository
@@ -200,11 +200,15 @@ stages:
               nvcr.io/nvidia/pytorch:22.11-py3 \
               bash -c ' \
                 set -ex; \
+                pip uninstall -y $(pip list --format=freeze | grep opencv); \
+                rm -rf /usr/local/lib/python3.8/dist-packages/cv2/; \
+                apt-get update; \
+                DEBIAN_FRONTEND="noninteractive" apt-get install --yes python3-opencv; \
                 python3 --version; \
                 python3 -m pip install --upgrade pip; \
                 python3 -m pip install /Release/*.whl; \
                 pushd /workspace/onnxruntime/python/tools/transformers/models/stable_diffusion; \
-                python3 -m pip install -r requirements-cuda11.txt; \
+                python3 -m pip install -r requirements/cuda11/requirements.txt; \
                 python3 -m pip install --upgrade polygraphy onnx-graphsurgeon ; \
                 echo Generate an image guided by a text prompt; \
                 python3 demo_txt2img.py --framework-model-dir /model_cache --seed 1 --deterministic "astronaut riding a horse on mars" ; \
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
index 1f9b506ac451f..b0f40429c1a1e 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml
@@ -49,9 +49,9 @@ parameters:
 variables:
 - name: docker_base_image
   ${{ if eq(parameters.CudaVersion, '11.8') }}:
-    value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241015.1
+    value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241020.1
   ${{ if eq(parameters.CudaVersion, '12.2') }}:
-    value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241015.1
+    value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241020.1
 - name: Repository
   ${{ if eq(parameters.CudaVersion, '11.8') }}:
diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
index e43cbd3413f2d..87d5c7bd824d2 100644
--- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml
@@ -39,9 +39,9 @@ parameters:
 variables:
 - name: docker_base_image
   ${{ if eq(parameters.CudaVersion, '11.8') }}:
-    value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241015.1
+    value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241020.1
   ${{ if eq(parameters.CudaVersion, '12.2') }}:
-    value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241015.1
+    value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241020.1
 - name: linux_trt_version
   ${{ if eq(parameters.CudaVersion, '11.8') }}:
     value: 10.4.0.26-1.cuda11.8
diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml
index ddcea447adc94..4842fcbd4dcfb 100644
--- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml
+++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml
@@ -40,7 +40,7 @@ stages:
     steps:
     - task: UsePythonVersion@0
       inputs:
-        versionSpec: '3.8'
+        versionSpec: '3.12'
        addToPath: true
        architecture: x64
 
diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
index 833e97b437c33..7f131590c900b 100644
--- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
+++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml
@@ -377,7 +377,7 @@ stages:
 
    - task: UsePythonVersion@0
      inputs:
-        versionSpec: '3.8'
+        versionSpec: '3.12'
        addToPath: true
        architecture: x64
 
@@ -411,7 +411,7 @@ stages:
    steps:
    - task: UsePythonVersion@0
      inputs:
-        versionSpec: "3.9"
+        versionSpec: "3.12"
        addToPath: true
        architecture: "x64"
 
@@ -447,7 +447,7 @@ stages:
    steps:
    - task: UsePythonVersion@0
      inputs:
-        versionSpec: "3.9"
+        versionSpec: "3.12"
        addToPath: true
        architecture: "x64"
 
diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml
index 7fb4563a477fc..e946fedd07a27 100644
--- a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml
@@ -18,7 +18,7 @@ stages:
       machine_pool: 'Onnxruntime-Linux-GPU'
       python_wheel_suffix: '_gpu'
       timeout: 480
-      docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241015.1
+      docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241020.1
       trt_version: '10.4.0.26-1.cuda12.6'
       cuda_version: '12.2'
diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
index 2641ec6d56ffb..c458f0cf4bfe2 100644
--- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml
@@ -54,7 +54,7 @@ stages:
       machine_pool: 'Onnxruntime-Linux-GPU'
       python_wheel_suffix: '_gpu'
       timeout: 480
-      docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241015.1
+      docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241020.1
       trt_version: '10.4.0.26-1.cuda11.8'
       cuda_version: '11.8'
diff --git a/tools/ci_build/github/azure-pipelines/rocm-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/rocm-nuget-packaging-pipeline.yml
index 9e1387ac47c97..471e911843aed 100644
--- a/tools/ci_build/github/azure-pipelines/rocm-nuget-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/rocm-nuget-packaging-pipeline.yml
@@ -255,7 +255,7 @@ stages:
    - task: UsePythonVersion@0
      displayName: 'Use Python'
      inputs:
-        versionSpec: 3.8
+        versionSpec: 3.12
 
    - task: MSBuild@1
      displayName: 'Build Nuget Packages'
diff --git a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml
index 87fe920d8ecdd..a38486995478d 100644
--- a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml
@@ -148,9 +148,9 @@ stages:
          value: false
      - name: docker_base_image
        ${{ if eq(parameters.CudaVersion, '11.8') }}:
-          value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241015.1
+          value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241020.1
        ${{ if eq(parameters.CudaVersion, '12.2') }}:
-          value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241015.1
+          value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241020.1
      timeoutInMinutes: 60
      steps:
diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
index 8c492c0153964..9289935b4ef9c 100644
--- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml
@@ -46,7 +46,7 @@ jobs:
      ${{ if eq(parameters.CudaVersion, '11.8') }}:
        value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8
      ${{ if eq(parameters.CudaVersion, '12.2') }}:
-        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241015.1
+        value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241020.1
    - name: linux_trt_version
      ${{ if eq(parameters.CudaVersion, '11.8') }}:
        value: 10.4.0.26-1.cuda11.8
diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-cuda-packaging-stage.yml
index b8ade5d36f5a1..7133031c84f49 100644
--- a/tools/ci_build/github/azure-pipelines/stages/nuget-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/nuget-cuda-packaging-stage.yml
@@ -135,7 +135,7 @@ stages:
    - task: UsePythonVersion@0
      displayName: 'Use Python'
      inputs:
-        versionSpec: 3.8
+        versionSpec: 3.12
 
    - task: MSBuild@1
      displayName: 'Build Nuget Packages'
diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml
index 466dbb2f21ec8..ae18687cb9e54 100644
--- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml
@@ -77,8 +77,8 @@ stages:
      cmake_build_type: ${{ parameters.cmake_build_type }}
      cuda_version: ${{ parameters.cuda_version }}
      ${{ if eq(parameters.cuda_version, '11.8') }}:
-        docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241015.1
+        docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241020.1
        trt_version: 10.4.0.26-1.cuda11.8
      ${{ if eq(parameters.cuda_version, '12.2') }}:
-        docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241015.1
+        docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241020.1
        trt_version: 10.4.0.26-1.cuda12.6
diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
index e933e1e70ff76..a98efa8f3fc92 100644
--- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml
@@ -446,7 +446,7 @@ stages:
    - task: UsePythonVersion@0
      displayName: 'Use Python'
      inputs:
-        versionSpec: 3.8
+        versionSpec: 3.12
 
    - task: MSBuild@1
      displayName: 'Build Nuget Packages'
diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml b/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml
index 2ab432e94fcbd..41ba5c3868f5e 100644
--- a/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/linux-wasm-ci.yml
@@ -73,7 +73,7 @@ jobs:
      displayName: 'Checkout submodules'
    - task: UsePythonVersion@0
      inputs:
-        versionSpec: '3.8'
+        versionSpec: '3.12'
        addToPath: true
        architecture: $(buildArch)
    - template: download-deps.yml
diff --git a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml
index 5cfa135135dca..90055cbbc6c3e 100644
--- a/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/ondevice-training-cpu-packaging-pipeline.yml
@@ -233,7 +233,7 @@ stages:
    - task: UsePythonVersion@0
      displayName: 'Use Python'
      inputs:
-        versionSpec: 3.8
+        versionSpec: 3.12
 
    - task: MSBuild@1
      displayName: 'Build Nuget Packages'
diff --git a/tools/ci_build/github/azure-pipelines/templates/publish-nuget-steps.yml b/tools/ci_build/github/azure-pipelines/templates/publish-nuget-steps.yml
index 8639a5ca0a55d..6e13db553629e 100644
--- a/tools/ci_build/github/azure-pipelines/templates/publish-nuget-steps.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/publish-nuget-steps.yml
@@ -34,7 +34,7 @@ stages:
    - task: UsePythonVersion@0
      inputs:
-        versionSpec: '3.9'
+        versionSpec: '3.12'
        addToPath: true
 
    - template: set-version-number-variables-step.yml
diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
index 6a131dc909a47..10d7ce04747d9 100644
--- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml
@@ -470,7 +470,7 @@ stages:
      parameters:
        arch: 'x86_64'
        machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU-Large'
-        docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241015.1
+        docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241020.1
        extra_build_arg: ${{ parameters.build_py_parameters }}
        cmake_build_type: ${{ parameters.cmake_build_type }}
        trt_version: '10.4.0.26-1.cuda11.8'
diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
index 30280c6e22c7e..7ec84453321ef 100644
--- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml
@@ -1,6 +1,6 @@
 parameters:
   QnnSdk: '2.27.0.240926'
-  build_config: 'RelWithDebInfo' 
+  build_config: 'RelWithDebInfo'
   IsReleaseBuild: false
   DoEsrp: false
   qnn_ep_build_pool_name: 'Onnxruntime-QNNEP-Windows-2022-CPU'
@@ -32,9 +32,9 @@ stages:
 
      - task: UsePythonVersion@0
        inputs:
-          versionSpec: '3.8'
+          versionSpec: '3.12'
          addToPath: true
-      
+
      - template: jobs/download_win_qnn_sdk.yml
        parameters:
          QnnSDKVersion: ${{ parameters.QnnSdk }}
@@ -44,7 +44,7 @@ stages:
        inputs:
          scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py'
          arguments: '--use_qnn --qnn_home $(QnnSDKRootDir) $(commonBuildArgs)'
-      
+
      - task: VSBuild@1
        displayName: 'Build onnxruntime'
        inputs:
diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml
index 8593aa2d821fa..ea3ec00e68f73 100644
--- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml
+++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml
@@ -23,7 +23,7 @@ parameters:
    displayName: 'Stage that the initial stage of react-native-ci depends on'
    type: string
    default: ''
-  
+
  - name: enable_code_sign
    displayName: Use GPG to sign the jars
    type: boolean
@@ -58,9 +58,9 @@ stages:
      steps:
        - template: use-xcode-version.yml
        - task: UsePythonVersion@0
-          displayName: Use python 3.9
+          displayName: Use python 3.12
          inputs:
-            versionSpec: "3.9"
+            versionSpec: "3.12"
            addToPath: true
            architecture: "x64"
 
@@ -113,9 +113,9 @@ stages:
          condition: always()
        - template: use-xcode-version.yml
        - task: UsePythonVersion@0
-          displayName: Use python 3.9
+          displayName: Use python 3.12
          inputs:
-            versionSpec: "3.9"
+            versionSpec: "3.12"
            addToPath: true
            architecture: "x64"
 
versionSpec: "3.12" addToPath: true architecture: "x64" diff --git a/tools/ci_build/github/azure-pipelines/templates/validate-package.yml b/tools/ci_build/github/azure-pipelines/templates/validate-package.yml index 5014b315a4083..529cca4586ef6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/validate-package.yml +++ b/tools/ci_build/github/azure-pipelines/templates/validate-package.yml @@ -11,11 +11,11 @@ steps: - task: UsePythonVersion@0 displayName: 'Use Python' inputs: - versionSpec: 3.8 + versionSpec: 3.12 - task: PythonScript@0 displayName: 'Validate Package' inputs: scriptPath: '${{parameters.ScriptPath}}' arguments: '--package_type ${{parameters.PackageType}} --package_name ${{parameters.PackageName}} --package_path ${{parameters.PackagePath}} --platforms_supported ${{parameters.PlatformsSupported}} --verify_nuget_signing ${{parameters.VerifyNugetSigning}}' - workingDirectory: ${{parameters.workingDirectory}} + workingDirectory: ${{parameters.workingDirectory}} diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 2cb7f94470d74..27c97bee23c5d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -360,7 +360,7 @@ stages: - task: UsePythonVersion@0 inputs: - versionSpec: '3.8' + versionSpec: '3.12' addToPath: true architecture: ${{ parameters.buildArch }} @@ -397,4 +397,4 @@ stages: parameters: msbuildPlatform: ${{ parameters.msbuildPlatform }} java_artifact_id: ${{ parameters.java_artifact_id }} - buildOnly: false \ No newline at end of file + buildOnly: false diff --git a/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml index 64e7b6dbb4455..5c18d075fc425 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-wasm-ci.yml @@ -76,7 +76,7 @@ jobs: displayName: 'Checkout submodules' - task: UsePythonVersion@0 inputs: - versionSpec: '3.8' + versionSpec: '3.12' addToPath: true architecture: $(buildArch) - task: NodeTool@0 diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index fdb6998f53d15..f55f476f70d30 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -54,7 +54,7 @@ jobs: - task: UsePythonVersion@0 inputs: - versionSpec: '3.8' + versionSpec: '3.12' addToPath: true architecture: $(buildArch) diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index 0b39bea26c7de..3ff213b16f3d1 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20241015.1 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20241020.1 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh index e6f38b5cbb76e..bf08a853fe7f4 100755 --- 
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh
index e6f38b5cbb76e..bf08a853fe7f4 100755
--- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh
@@ -40,7 +40,7 @@ cd /tmp/src
 
 CPU_ARCH=$(uname -m)
 echo "Installing cmake"
-GetFile "https://github.com/Kitware/CMake/releases/download/v3.31.0-rc1/cmake-3.31.0-rc1-linux-$CPU_ARCH.tar.gz" "/tmp/src/cmake.tar.gz"
+GetFile "https://github.com/Kitware/CMake/releases/download/v3.31.0-rc2/cmake-3.31.0-rc2-linux-$CPU_ARCH.tar.gz" "/tmp/src/cmake.tar.gz"
 tar -zxf /tmp/src/cmake.tar.gz --strip=1 -C /usr
 
 echo "Installing Ninja"
diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
index 933b56e4fd413..3f42b28497c7a 100644
--- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc12:20241015.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc12:20241020.1
 
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_deps.sh
index 53a49a996ad2d..0cc48a720b8f4 100755
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_deps.sh
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_deps.sh
@@ -39,7 +39,7 @@ mkdir -p /tmp/src
 cd /tmp/src
 
 CPU_ARCH=$(uname -m)
 echo "Installing cmake"
-GetFile "https://github.com/Kitware/CMake/releases/download/v3.31.0-rc1/cmake-3.31.0-rc1-linux-$CPU_ARCH.tar.gz" "/tmp/src/cmake.tar.gz"
+GetFile "https://github.com/Kitware/CMake/releases/download/v3.31.0-rc2/cmake-3.31.0-rc2-linux-$CPU_ARCH.tar.gz" "/tmp/src/cmake.tar.gz"
 tar -zxf /tmp/src/cmake.tar.gz --strip=1 -C /usr
 
 echo "Installing Ninja"
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile
index 238f0c9a0d922..6702474d75801 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
 
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11_dotnet:20241015.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11_dotnet:20241020.1
 ARG TRT_VERSION
 
 RUN rpm -Uvh https://packages.microsoft.com/config/centos/8/packages-microsoft-prod.rpm && dnf install -y msopenjdk-11
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
index 24a4503c03f4c..4059de23b2480 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile
@@ -2,7 +2,7 @@
 # Licensed under the MIT License.
 # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline
 
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20241015.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20241020.1
 ARG TRT_VERSION
 
 #Install TensorRT only if TRT_VERSION is not empty
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
index deea9db9aae91..76b31e71a7dea 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20241015.1
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20241020.1
 
 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && rm -rf /tmp/scripts