diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index d728ae797429e..78db7d735dad9 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -26,7 +26,7 @@ "component": { "type": "git", "git": { - "commitHash": "990217f043af7222348ca8f0301e17fa7b841781", + "commitHash": "595228d99e3977ac27cb79d5963adda262af99ad", "repositoryUrl": "https://github.com/onnx/onnx.git" }, "comments": "git submodule at cmake/external/onnx" @@ -216,7 +216,7 @@ "component": { "type": "git", "git": { - "commitHash": "eb43908b02a296ea0594432f06e9d3fac288d672", + "commitHash": "06adf4461ac84035bee658c6cf5df39f7ab6071d", "repositoryUrl": "https://github.com/onnx/onnx-tensorrt.git" }, "comments": "onnx_tensorrt" diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 974bccd10f360..5c35ef93054df 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -59,8 +59,8 @@ if (NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING "Choose build type: Debug Release RelWithDebInfo MinSizeRel." FORCE) endif() -if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 8) - message(FATAL_ERROR "GCC version must be greater than or equal to 8") +if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9) + message(FATAL_ERROR "GCC version must be greater than or equal to 9") endif() # Options @@ -1300,12 +1300,6 @@ if (onnxruntime_USE_TVM) list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES tvm) endif() -# needs to link with stdc++fs in Linux -if (UNIX AND "${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND CMAKE_C_COMPILER_VERSION VERSION_LESS 9) - set(FS_STDLIB stdc++fs) -endif() -list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${FS_STDLIB}) - # onnxruntime-extensions if (onnxruntime_USE_EXTENSIONS) include(extensions) @@ -1474,16 +1468,6 @@ if (onnxruntime_USE_CUDA) endif() endif() -if (onnxruntime_USE_TENSORRT) - # needs to link with stdc++fs in Linux - if (UNIX) - if (NOT APPLE) - set(FS_STDLIB stdc++fs) - endif() - endif() - list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${FS_STDLIB}) -endif() - if (onnxruntime_USE_MIGRAPHX) if (WIN32) message(FATAL_ERROR "MIGraphX does not support build in Windows!") diff --git a/cmake/deps.txt b/cmake/deps.txt index d4d19dea08c8b..88c1881ad82fb 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -36,7 +36,7 @@ microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.z mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851 -onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.0.zip;a6d8b619459fb4657f8bec7d1c6d95ad6d4c069d +onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.16.1.zip;2eb9198bb352757d5ff13977cbe0634898e0837c #use the latest commit of 10.0-GA onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/06adf4461ac84035bee658c6cf5df39f7ab6071d.zip;46dceef659d75d276e7914a8057c2282269d5e7b protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa diff --git a/cmake/external/onnx b/cmake/external/onnx index 990217f043af7..595228d99e397 160000 --- a/cmake/external/onnx +++ b/cmake/external/onnx @@ -1 +1 @@ -Subproject commit 
990217f043af7222348ca8f0301e17fa7b841781 +Subproject commit 595228d99e3977ac27cb79d5963adda262af99ad diff --git a/cmake/onnxruntime_providers_migraphx.cmake b/cmake/onnxruntime_providers_migraphx.cmake index 91ac66a40721d..01c4f8b2c8719 100644 --- a/cmake/onnxruntime_providers_migraphx.cmake +++ b/cmake/onnxruntime_providers_migraphx.cmake @@ -49,7 +49,7 @@ target_compile_options(onnxruntime_providers_migraphx PRIVATE -Wno-error=sign-compare) set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") set_property(TARGET onnxruntime_providers_migraphx APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/migraphx/version_script.lds -Xlinker --gc-sections") - target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp stdc++fs) + target_link_libraries(onnxruntime_providers_migraphx PRIVATE nsync::nsync_cpp) include(CheckLibraryExists) check_library_exists(migraphx::c "migraphx_program_run_async" "/opt/rocm/migraphx/lib" HAS_STREAM_SYNC) diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index 1e8f388548faf..e56de0c7124dc 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -206,7 +206,7 @@ elseif(UNIX) set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY COMPILE_FLAGS "-Wno-deprecated-declarations") set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-Xlinker --version-script=${ONNXRUNTIME_ROOT}/core/providers/tensorrt/version_script.lds -Xlinker --gc-sections") - target_link_libraries(onnxruntime_providers_tensorrt PRIVATE nsync::nsync_cpp stdc++fs) + target_link_libraries(onnxruntime_providers_tensorrt PRIVATE nsync::nsync_cpp) elseif(WIN32) set_property(TARGET onnxruntime_providers_tensorrt APPEND_STRING PROPERTY LINK_FLAGS "-DEF:${ONNXRUNTIME_ROOT}/core/providers/tensorrt/symbols.def") else() diff --git a/cmake/patches/onnx/onnx.patch b/cmake/patches/onnx/onnx.patch index fe8d6622bcc0e..162d33581a5ca 100644 --- a/cmake/patches/onnx/onnx.patch +++ b/cmake/patches/onnx/onnx.patch @@ -36,15 +36,15 @@ index b847798e..a6c31904 100644 --- a/onnx/common/file_utils.h +++ b/onnx/common/file_utils.h @@ -6,7 +6,6 @@ - + #pragma once - + -#include #include #include - + @@ -17,8 +16,7 @@ namespace ONNX_NAMESPACE { - + template void LoadProtoFromPath(const std::string proto_path, T& proto) { - std::filesystem::path proto_u8_path = std::filesystem::u8path(proto_path); @@ -53,42 +53,6 @@ index b847798e..a6c31904 100644 if (!proto_stream.good()) { fail_check("Unable to open proto file: ", proto_path, ". Please check if it is a valid proto. 
"); } -diff --git a/onnx/defs/quantization/defs.cc b/onnx/defs/quantization/defs.cc -index 70b4a4db..98c11545 100644 ---- a/onnx/defs/quantization/defs.cc -+++ b/onnx/defs/quantization/defs.cc -@@ -200,6 +200,9 @@ ONNX_OPERATOR_SET_SCHEMA( - .SetDoc(DequantizeLinear_ver21_doc) - .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - propagateElemTypeFromInputToOutput(ctx, 1, 0); -+ if (!hasInputShape(ctx, 0)) { -+ return; -+ } - auto& input_shape = getInputShape(ctx, 0); - updateOutputShape(ctx, 0, input_shape); - })); -diff --git a/onnx/defs/quantization/old.cc b/onnx/defs/quantization/old.cc -index 3f2d6384..d2f7cfd8 100644 ---- a/onnx/defs/quantization/old.cc -+++ b/onnx/defs/quantization/old.cc -@@ -130,6 +130,9 @@ ONNX_OPERATOR_SET_SCHEMA( - .SetDoc(DequantizeLinear_ver19_doc) - .TypeAndShapeInferenceFunction([](ONNX_NAMESPACE::InferenceContext& ctx) { - propagateElemTypeFromInputToOutput(ctx, 1, 0); -+ if (!hasInputShape(ctx, 0)) { -+ return; -+ } - auto& input_shape = getInputShape(ctx, 0); - updateOutputShape(ctx, 0, input_shape); - })); -@@ -181,7 +184,6 @@ ONNX_OPERATOR_SET_SCHEMA( - if (!hasInputShape(ctx, 0)) { - return; - } -- - auto& input_shape = getInputShape(ctx, 0); - updateOutputShape(ctx, 0, input_shape); - })); diff --git a/onnx/onnx_pb.h b/onnx/onnx_pb.h index 0aab3e26..398ac2d6 100644 --- a/onnx/onnx_pb.h @@ -96,7 +60,7 @@ index 0aab3e26..398ac2d6 100644 @@ -47,10 +47,28 @@ #define ONNX_API ONNX_IMPORT #endif - + +#if defined(__GNUC__) +#pragma GCC diagnostic push + @@ -116,61 +80,9 @@ index 0aab3e26..398ac2d6 100644 #else #include "onnx/onnx.pb.h" #endif - + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif + #endif // ! ONNX_ONNX_PB_H -diff --git a/onnx/shape_inference/implementation.cc b/onnx/shape_inference/implementation.cc -index fab1faf2..8723dcd4 100644 ---- a/onnx/shape_inference/implementation.cc -+++ b/onnx/shape_inference/implementation.cc -@@ -488,29 +488,29 @@ class ShapeInferenceImplBase { - ProcessCall(n, *(iter->second), ctx); - } else { - has_unsupported_op = true; -+ return; - } - } else { - has_unsupported_op = true; -+ return; - } -- if (!has_unsupported_op) { -- for (int i = 0; i < n.output_size(); ++i) { -- // skip type and shape propagation for missing optional outputs. -- if (!n.output(i).empty()) -- UpdateType(n.output(i), ctx.getOutputType(i)); -- } -- // Constant values are tracked to improve inference/checking for subsequent nodes. -- ProcessConstant(n); -- // If data-propagation is enabled, partial-evaluation (aka data-propagation) is performed -- // to improve inference/checking for subsequent nodes. -- if (options.enable_data_propagation && schema && schema->has_data_propagation_function()) { -- if (generated_shape_data_by_name == nullptr) { -- fail_shape_inference( -- "Container for generated shape data cannot be nullptr when enable_data_propagation option is set."); -- } -- DataPropagationContextImpl data_propagation_ctx( -- n, value_types_by_name, input_data_by_name, *generated_shape_data_by_name); -- schema->GetDataPropagationFunction()(data_propagation_ctx); -+ for (int i = 0; i < n.output_size(); ++i) { -+ // skip type and shape propagation for missing optional outputs. -+ if (!n.output(i).empty()) -+ UpdateType(n.output(i), ctx.getOutputType(i)); -+ } -+ // Constant values are tracked to improve inference/checking for subsequent nodes. 
-+ ProcessConstant(n); -+ // If data-propagation is enabled, partial-evaluation (aka data-propagation) is performed -+ // to improve inference/checking for subsequent nodes. -+ if (options.enable_data_propagation && schema && schema->has_data_propagation_function()) { -+ if (generated_shape_data_by_name == nullptr) { -+ fail_shape_inference( -+ "Container for generated shape data cannot be nullptr when enable_data_propagation option is set."); - } -+ DataPropagationContextImpl data_propagation_ctx( -+ n, value_types_by_name, input_data_by_name, *generated_shape_data_by_name); -+ schema->GetDataPropagationFunction()(data_propagation_ctx); - } - } - ONNX_CATCH(const ONNX_NAMESPACE::InferenceError& ex) { diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index 8092c26da651a..67bfe48327e14 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -421,7 +421,7 @@ Do not modify directly.* |Transpose|*in* data:**T**
 *out* transposed:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(float8e4m3fn), tensor(float8e4m3fnuz), tensor(float8e5m2), tensor(float8e5m2fnuz), tensor(int16), tensor(int32), tensor(int4), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint4), tensor(uint64), tensor(uint8)| |||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[1, 12]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| -|Trilu|*in* input:**T**<br/> *in* k:**tensor(int64)**<br/> *out* output:**T**|14+|**T** = tensor(double), tensor(float), tensor(int64)| +|Trilu|*in* input:**T**<br/> *in* k:**tensor(int64)**<br/> *out* output:**T**|14+|**T** = tensor(bool), tensor(double), tensor(float), tensor(int64)| |Unique|*in* X:**T**<br/> *out* Y:**T**<br/> *out* indices:**tensor(int64)**<br/> *out* inverse_indices:**tensor(int64)**<br/> *out* counts:**tensor(int64)**|11+|**T** = tensor(double), tensor(float), tensor(int64), tensor(int8), tensor(string)| |Unsqueeze|*in* data:**T**<br/> *in* axes:**tensor(int64)**<br/> *out* expanded:**T**<br/><br/>or<br/><br/>*in* data:**T**
*out* expanded:**T**|21+|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| |||[13, 20]|**T** = tensor(bfloat16), tensor(bool), tensor(double), tensor(float), tensor(float16), tensor(int16), tensor(int32), tensor(int64), tensor(int8), tensor(string), tensor(uint16), tensor(uint32), tensor(uint64), tensor(uint8)| diff --git a/js/package-lock.json b/js/package-lock.json index 1f8a6a09039d3..548706ee286b7 100644 --- a/js/package-lock.json +++ b/js/package-lock.json @@ -27,6 +27,7 @@ "mocha": "^10.2.0", "npmlog": "^7.0.1", "prettier": "^3.0.3", + "terser": "^5.31.0", "typescript": "^5.2.2" } }, @@ -600,6 +601,64 @@ "integrity": "sha512-6EwiSjwWYP7pTckG6I5eyFANjPhmPjUX9JRLUSfNPC7FX7zK9gyZAfUEaECL6ALTpGX5AjnBq3C9XmVWPitNpw==", "dev": true }, + "node_modules/@jridgewell/gen-mapping": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz", + "integrity": "sha512-IzL8ZoEDIBRWEzlCcRhOaCupYyN5gdIK+Q6fbFdPDg6HqX6jpkItn7DFIpW9LQzXG6Df9sA7+OKnq0qlz/GaQg==", + "dev": true, + "dependencies": { + "@jridgewell/set-array": "^1.2.1", + "@jridgewell/sourcemap-codec": "^1.4.10", + "@jridgewell/trace-mapping": "^0.3.24" + }, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "dev": true, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/set-array": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz", + "integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==", + "dev": true, + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/source-map": { + "version": "0.3.6", + "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.6.tgz", + "integrity": "sha512-1ZJTZebgqllO79ue2bm3rIGud/bOe0pP5BjSRCRxxYkEZS8STV7zN84UBbiYu7jy+eCKSnVIUgoWWE/tt+shMQ==", + "dev": true, + "dependencies": { + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.25" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.4.15", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.15.tgz", + "integrity": "sha512-eF2rxCRulEKXHTRiDrDy6erMYWqNw4LPdQ8UQA4huuxaQsVeRPFl2oM8oDGxMFhJUWZf9McpLtJasDDZb/Bpeg==", + "dev": true + }, + "node_modules/@jridgewell/trace-mapping": { + "version": "0.3.25", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz", + "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==", + "dev": true, + "dependencies": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, "node_modules/@jspm/core": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/@jspm/core/-/core-2.0.1.tgz", @@ -1288,6 +1347,12 @@ "ieee754": "^1.2.1" } }, + "node_modules/buffer-from": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", + "integrity": 
"sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==", + "dev": true + }, "node_modules/builtin-modules": { "version": "3.3.0", "resolved": "https://registry.npmjs.org/builtin-modules/-/builtin-modules-3.3.0.tgz", @@ -1479,6 +1544,12 @@ "color-support": "bin.js" } }, + "node_modules/commander": { + "version": "2.20.3", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz", + "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==", + "dev": true + }, "node_modules/comment-parser": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/comment-parser/-/comment-parser-1.4.0.tgz", @@ -4172,6 +4243,25 @@ "node": ">=8" } }, + "node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/source-map-support": { + "version": "0.5.21", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz", + "integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==", + "dev": true, + "dependencies": { + "buffer-from": "^1.0.0", + "source-map": "^0.6.0" + } + }, "node_modules/spdx-correct": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.1.1.tgz", @@ -4341,6 +4431,24 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/terser": { + "version": "5.31.0", + "resolved": "https://registry.npmjs.org/terser/-/terser-5.31.0.tgz", + "integrity": "sha512-Q1JFAoUKE5IMfI4Z/lkE/E6+SwgzO+x4tq4v1AyBLRj8VSYvRO6A/rQrPg1yud4g0En9EKI1TvFRF2tQFcoUkg==", + "dev": true, + "dependencies": { + "@jridgewell/source-map": "^0.3.3", + "acorn": "^8.8.2", + "commander": "^2.20.0", + "source-map-support": "~0.5.20" + }, + "bin": { + "terser": "bin/terser" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/text-table": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", @@ -5009,6 +5117,55 @@ "integrity": "sha512-6EwiSjwWYP7pTckG6I5eyFANjPhmPjUX9JRLUSfNPC7FX7zK9gyZAfUEaECL6ALTpGX5AjnBq3C9XmVWPitNpw==", "dev": true }, + "@jridgewell/gen-mapping": { + "version": "0.3.5", + "resolved": "https://registry.npmjs.org/@jridgewell/gen-mapping/-/gen-mapping-0.3.5.tgz", + "integrity": "sha512-IzL8ZoEDIBRWEzlCcRhOaCupYyN5gdIK+Q6fbFdPDg6HqX6jpkItn7DFIpW9LQzXG6Df9sA7+OKnq0qlz/GaQg==", + "dev": true, + "requires": { + "@jridgewell/set-array": "^1.2.1", + "@jridgewell/sourcemap-codec": "^1.4.10", + "@jridgewell/trace-mapping": "^0.3.24" + } + }, + "@jridgewell/resolve-uri": { + "version": "3.1.2", + "resolved": "https://registry.npmjs.org/@jridgewell/resolve-uri/-/resolve-uri-3.1.2.tgz", + "integrity": "sha512-bRISgCIjP20/tbWSPWMEi54QVPRZExkuD9lJL+UIxUKtwVJA8wW1Trb1jMs1RFXo1CBTNZ/5hpC9QvmKWdopKw==", + "dev": true + }, + "@jridgewell/set-array": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/@jridgewell/set-array/-/set-array-1.2.1.tgz", + "integrity": "sha512-R8gLRTZeyp03ymzP/6Lil/28tGeGEzhx1q2k703KGWRAI1VdvPIXdG70VJc2pAMw3NA6JKL5hhFu1sJX0Mnn/A==", + "dev": true + }, + "@jridgewell/source-map": { + "version": "0.3.6", + "resolved": "https://registry.npmjs.org/@jridgewell/source-map/-/source-map-0.3.6.tgz", + "integrity": 
"sha512-1ZJTZebgqllO79ue2bm3rIGud/bOe0pP5BjSRCRxxYkEZS8STV7zN84UBbiYu7jy+eCKSnVIUgoWWE/tt+shMQ==", + "dev": true, + "requires": { + "@jridgewell/gen-mapping": "^0.3.5", + "@jridgewell/trace-mapping": "^0.3.25" + } + }, + "@jridgewell/sourcemap-codec": { + "version": "1.4.15", + "resolved": "https://registry.npmjs.org/@jridgewell/sourcemap-codec/-/sourcemap-codec-1.4.15.tgz", + "integrity": "sha512-eF2rxCRulEKXHTRiDrDy6erMYWqNw4LPdQ8UQA4huuxaQsVeRPFl2oM8oDGxMFhJUWZf9McpLtJasDDZb/Bpeg==", + "dev": true + }, + "@jridgewell/trace-mapping": { + "version": "0.3.25", + "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.25.tgz", + "integrity": "sha512-vNk6aEwybGtawWmy/PzwnGDOjCkLWSD2wqvjGGAgOAwCGWySYXfYoxt00IJkTF+8Lb57DwOb3Aa0o9CApepiYQ==", + "dev": true, + "requires": { + "@jridgewell/resolve-uri": "^3.1.0", + "@jridgewell/sourcemap-codec": "^1.4.14" + } + }, "@jspm/core": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/@jspm/core/-/core-2.0.1.tgz", @@ -5482,6 +5639,12 @@ "ieee754": "^1.2.1" } }, + "buffer-from": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz", + "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==", + "dev": true + }, "builtin-modules": { "version": "3.3.0", "resolved": "https://registry.npmjs.org/builtin-modules/-/builtin-modules-3.3.0.tgz", @@ -5613,6 +5776,12 @@ "integrity": "sha512-qiBjkpbMLO/HL68y+lh4q0/O1MZFj2RX6X/KmMa3+gJD3z+WwI1ZzDHysvqHGS3mP6mznPckpXmw1nI9cJjyRg==", "dev": true }, + "commander": { + "version": "2.20.3", + "resolved": "https://registry.npmjs.org/commander/-/commander-2.20.3.tgz", + "integrity": "sha512-GpVkmM8vF2vQUkj2LvZmD35JxeJOLCwJ9cUkugyk2nuhbv3+mJvpLYYt+0+USMxE+oj+ey/lJEnhZw75x/OMcQ==", + "dev": true + }, "comment-parser": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/comment-parser/-/comment-parser-1.4.0.tgz", @@ -7603,6 +7772,22 @@ "integrity": "sha512-g9Q1haeby36OSStwb4ntCGGGaKsaVSjQ68fBxoQcutl5fS1vuY18H3wSt3jFyFtrkx+Kz0V1G85A4MyAdDMi2Q==", "dev": true }, + "source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "dev": true + }, + "source-map-support": { + "version": "0.5.21", + "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz", + "integrity": "sha512-uBHU3L3czsIyYXKX88fdrGovxdSCoTGDRZ6SYXtSRxLZUzHg5P/66Ht6uoUlHu9EZod+inXhKo3qQgwXUT/y1w==", + "dev": true, + "requires": { + "buffer-from": "^1.0.0", + "source-map": "^0.6.0" + } + }, "spdx-correct": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/spdx-correct/-/spdx-correct-3.1.1.tgz", @@ -7733,6 +7918,18 @@ "integrity": "sha512-ot0WnXS9fgdkgIcePe6RHNk1WA8+muPa6cSjeR3V8K27q9BB1rTE3R1p7Hv0z1ZyAc8s6Vvv8DIyWf681MAt0w==", "dev": true }, + "terser": { + "version": "5.31.0", + "resolved": "https://registry.npmjs.org/terser/-/terser-5.31.0.tgz", + "integrity": "sha512-Q1JFAoUKE5IMfI4Z/lkE/E6+SwgzO+x4tq4v1AyBLRj8VSYvRO6A/rQrPg1yud4g0En9EKI1TvFRF2tQFcoUkg==", + "dev": true, + "requires": { + "@jridgewell/source-map": "^0.3.3", + "acorn": "^8.8.2", + "commander": "^2.20.0", + "source-map-support": "~0.5.20" + } + }, "text-table": { "version": "0.2.0", "resolved": "https://registry.npmjs.org/text-table/-/text-table-0.2.0.tgz", diff --git a/js/package.json b/js/package.json index 63b7df6ed9de3..308d6931a927c 
100644 --- a/js/package.json +++ b/js/package.json @@ -21,6 +21,7 @@ "mocha": "^10.2.0", "npmlog": "^7.0.1", "prettier": "^3.0.3", + "terser": "^5.31.0", "typescript": "^5.2.2" }, "scripts": { diff --git a/js/react_native/e2e/ios/Podfile b/js/react_native/e2e/ios/Podfile index bec13598229cd..4bf19f965c553 100644 --- a/js/react_native/e2e/ios/Podfile +++ b/js/react_native/e2e/ios/Podfile @@ -1,7 +1,7 @@ require_relative '../node_modules/react-native/scripts/react_native_pods' require_relative '../node_modules/@react-native-community/cli-platform-ios/native_modules' -platform :ios, '12.4' +platform :ios, '13.0' target 'OnnxruntimeModuleExample' do config = use_native_modules! diff --git a/js/react_native/ios/Podfile b/js/react_native/ios/Podfile index b5bd197d1ebd9..e3887e327b7af 100644 --- a/js/react_native/ios/Podfile +++ b/js/react_native/ios/Podfile @@ -1,7 +1,7 @@ require_relative '../node_modules/react-native/scripts/react_native_pods' require_relative '../node_modules/@react-native-community/cli-platform-ios/native_modules' -platform :ios, '12.4' +platform :ios, '13.0' def shared config = use_native_modules! diff --git a/js/react_native/onnxruntime-react-native.podspec b/js/react_native/onnxruntime-react-native.podspec index 914a396be1f1d..50eba7dfaa1e0 100644 --- a/js/react_native/onnxruntime-react-native.podspec +++ b/js/react_native/onnxruntime-react-native.podspec @@ -15,7 +15,7 @@ Pod::Spec.new do |spec| spec.license = package["license"] spec.authors = package["author"] - spec.platforms = { :ios => "12.4" } + spec.platforms = { :ios => "13.0" } spec.source = { :git => "https://github.com/Microsoft/onnxruntime.git", :tag => "rel-#{spec.version}" } spec.source_files = "ios/*.{h,mm}" diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md index 3af4942c2e4aa..919b005ec4c21 100644 --- a/js/web/docs/webgpu-operators.md +++ b/js/web/docs/webgpu-operators.md @@ -74,6 +74,7 @@ Do not modify directly.* | Not | ai.onnx(1+) | | | Pad | ai.onnx(2-10,11-12,13-17,18,19+) | | | Pow | ai.onnx(7-11,12,13-14,15+) | | +| QuickGelu | com.microsoft(1+) | | | Range | ai.onnx(11+) | | | Reciprocal | ai.onnx(6-12,13+) | | | ReduceL1 | ai.onnx(1-10,11-12,13-17,18+) | | diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index bcabb6896f339..966c93a85ae2a 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -13,19 +13,19 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim |:------:|:------:|:------:|:-:|:-:|:------| | Abs | ai.onnx(7-12, 13+) | abs | ✓ | ✓ | | | Add | ai.onnx(7-12, 13, 14+) | add | ✓ | ✓ | | -| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | ✗ | ✓ | | -| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | ✗ | ✓ | | +| ArgMax | ai.onnx(7-10, 11, 12, 13+) | argMax | ✓ | ✓ | WebNN CPU backend only supports 'select_last_index' value is 0 | +| ArgMin | ai.onnx(7-10, 11, 12, 13+) | argMin | ✓ | ✓ | WebNN CPU backend only supports 'select_last_index' value is 0 | | AveragePool | ai.onnx(7-9, 10, 11, 12-18, 19+) | averagePool2d | ✓ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'count_include_pad' value is 0 | | BatchNormalization | ai.onnx(7-8, 9-13, 14, 15+) | batchNormalization | ✗ | ✓ | Only supports 'training_mode' value is 0, one output | | Cast | ai.onnx(7-8, 9-12, 13-18, 19-20, 21+) | cast | ✗ | ✓ | | | Ceil | ai.onnx(7-12, 13+) | ceil | ✓ | ✓ | | -| Clip | ai.onnx(7-10, 11, 12, 13+) | clamp | ✓ | ✓ | | +| Clip | ai.onnx(7-10, 11, 12, 13+) | clamp | ✓ | ✓ | WebNN CPU backend only 
supports 3 specific ranges: [0.0, infinity], [-1.0, 1.0], [0.0, 6.0] (Chromium issue: https://issues.chromium.org/issues/326156496) | | Concat | ai.onnx(7-10, 11-12, 13+) | concat | ✓ | ✓ | | | Conv | ai.onnx(7-10, 11+) | conv2d | ✓ | ✓ | Only supports 3-D or 4-D input and 'W' (weight). WebNN CPU requires the 'W' (weight) input to be a constant | | ConvTranspose | ai.onnx(7-10, 11+) | convTranspose2d | ✓ | ✗ | Only supports 3-D or 4-D input and 'W' (weight). | | Cos | ai.onnx(7+) | cos | ✗ | ✓ | | | Div | ai.onnx(7-12, 13, 14+) | div | ✓ | ✓ | | -| Elu | ai.onnx(7+) | elu | ✓ | ✓ | | +| Elu | ai.onnx(7+) | elu | ✓ | ✓ | WebNN CPU backend only supports 'alpha' value is 1.0 | | Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal | ✗ | ✓ | | | Erf | ai.onnx(7-9, 10-12, 13+) | erf | ✗ | ✓ | | | Exp | ai.onnx(7-12, 13+) | exp | ✗ | ✓ | | @@ -50,7 +50,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim | LessOrEqual | ai.onnx(12-15, 16+) | lesserOrEqual | ✗ | ✓ | | | Log | ai.onnx(7-12, 13+) | log | ✗ | ✓ | | | LpPool | ai.onnx(7-10, 11-17, 18+) | l2Pool2d | ✗ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'p' value is 2 | -| MatMul | ai.onnx(7-8, 9-12, 13+) | matmul | ✓ | ✓ | WebNN CPU doesn't support broadcasting for MatMul | +| MatMul | ai.onnx(7-8, 9-12, 13+) | matmul | ✓ | ✓ | | | Max | ai.onnx(7, 8-11, 12, 13+) | max | ✓ | ✓ | | | MaxPool | ai.onnx(7, 8-9, 10, 11, 12+) | maxPool2d | ✓ | ✓ | Only supports 4-D input, 2-D 'kernel_shape', 'storage_order' != 1, one output | | Min | ai.onnx(7, 8-11, 12, 13+) | min | ✓ | ✓ | | @@ -73,7 +73,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim | ReduceSumSquare | ai.onnx(7-10, 11-12, 13-17, 18+) | reduceSumSquare | ✗ | ✓ | Input 'axes' if present should be a constant | | Relu | ai.onnx(7-12, 13, 14+) | relu | ✓ | ✓ | | | Reshape | ai.onnx(7-12, 13, 14-18, 19-20, 21+) | reshape | ✓ | ✓ | Input 'shape' should be a constant, 0 dimension value in 'shape' is not supported | -| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, exclude_outside != 0, input 'scales' and 'sizes' if present must be a constant, WebNN CPU backend only supports 'linear' mode, WebNN GPU backend only supports 'linear' and 'nearest' modes | +| Resize | ai.onnx(11-12, 13-17, 18, 19+) | resample2d | ✓ | ✓ | Only supports 4-D input, exclude_outside != 0, input 'scales' and 'sizes' if present must be a constant, 'linear' and 'nearest' modes | | Shape | ai.onnx(7-12, 13-14, 15-18, 19-20, 21+) | slice | ✓ | ✓ | | | Sigmoid | ai.onnx(7-12, 13+) | sigmoid | ✓ | ✓ | | | Softplus | ai.onnx(7+) | softplus | ✗ | ✓ | | @@ -81,7 +81,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim | Sin | ai.onnx(7+) | sin | ✗ | ✓ | | | Slice | ai.onnx(7-9, 10, 11-12, 13+) | slice | ✓ | ✓ | Input 'starts', 'ends', 'axes', and 'steps' if present must be a constant, only supports 'steps' value 1 | | Softmax | ai.onnx(7-10, 11-12, 13+) | softmax | ✓ | ✓ | Only supports input rank >= 2 | -| Split | ai.onnx(7-10, 11-12, 13-17, 18+) | split | ✓ | ✓ | Input 'split' if present should be a constant, WebNN CPU backend only supports up to 4 outputs | +| Split | ai.onnx(7-10, 11-12, 13-17, 18+) | split | ✓ | ✓ | Input 'split' if present should be a constant | | Sqrt | ai.onnx(7-12, 13+) | sqrt | ✓ | ✓ | | | Squeeze | ai.onnx(7-10, 11-12, 13-20, 21+) | reshape | ✓ | ✓ | Input 'axes' if present should be a constant | | Sub | ai.onnx(7-12, 13, 14+) | sub | ✓ | ✓ | | diff --git 
a/js/web/lib/build-def.d.ts b/js/web/lib/build-def.d.ts index 4f30e71d690a3..188aaebc7d187 100644 --- a/js/web/lib/build-def.d.ts +++ b/js/web/lib/build-def.d.ts @@ -32,6 +32,10 @@ interface BuildDefinitions { * defines whether to disable training APIs in WebAssembly backend. */ readonly DISABLE_TRAINING: boolean; + /** + * defines whether to disable dynamic importing WASM module in the build. + */ + readonly DISABLE_DYNAMIC_IMPORT: boolean; // #endregion diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 2d2f345d0c273..ce5b4455fde60 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -107,6 +107,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Not', [unaryOps.not]], ['Pad', [pad]], ['Pow', [binaryOps.pow]], + ['QuickGelu', [unaryOps.quickgelu, unaryOps.parseAlphaAttributes]], ['Range', [range]], ['Reciprocal', [unaryOps.reciprocal]], ['ReduceMin', [reduceMin]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts index 5f105c745739e..12ba2a10cdf9f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/unary-op.ts @@ -314,3 +314,31 @@ export const thresholdedRelu = (context: ComputeContext, attributes: AlphaAttrib export const log = (context: ComputeContext): void => { context.compute(createElementwiseProgramInfo(context.inputs[0], 'Log', 'log')); }; + +export const quickGeluImpl = (varType: string, alpha: number) => ` +const alpha = vec4<${varType}>(${alpha}); +const one = ${varType}(1.0); +const zero = ${varType}(0.0); + +fn quick_gelu_impl(x: vec4<${varType}>) -> vec4<${varType}> { + let v = x *alpha; + var x1 : vec4<${varType}>; + for (var i = 0; i < 4; i = i + 1) { + if (v[i] >= zero) { + x1[i] = one / (one + exp(-v[i])); + } else { + x1[i] = one - one / (one + exp(v[i])); + } + } + return x * x1; +} +`; + +export const quickGeluExpression = (x: string) => `quick_gelu_impl(${x})`; + +export const quickgelu = (context: ComputeContext, attributes: AlphaAttributes): void => { + const dType = tensorTypeToWsglValueType(context.inputs[0].dataType); + context.compute(createElementwiseProgramInfo( + context.inputs[0], 'QuickGelu', quickGeluExpression, quickGeluImpl(dType, attributes.alpha), attributes.cacheKey, + context.inputs[0].dataType)); +}; diff --git a/js/web/lib/wasm/wasm-utils-import.ts b/js/web/lib/wasm/wasm-utils-import.ts index c14941ee6afbe..f80bd7195d456 100644 --- a/js/web/lib/wasm/wasm-utils-import.ts +++ b/js/web/lib/wasm/wasm-utils-import.ts @@ -121,12 +121,28 @@ export const importProxyWorker = async(): Promise<[undefined | string, Worker]> return [url, createProxyWorker!(url)]; }; +/** + * The embedded WebAssembly module. + * + * This is only available in ESM and when embedding is not disabled. + */ +const embeddedWasmModule: EmscriptenModuleFactory|undefined = + BUILD_DEFS.IS_ESM && BUILD_DEFS.DISABLE_DYNAMIC_IMPORT ? + // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires + require( + !BUILD_DEFS.DISABLE_TRAINING ? '../../dist/ort-training-wasm-simd-threaded.mjs' : + !BUILD_DEFS.DISABLE_JSEP ? '../../dist/ort-wasm-simd-threaded.jsep.mjs' : + '../../dist/ort-wasm-simd-threaded.mjs') + .default : + undefined; + /** * Import the WebAssembly module. * * This function will perform the following steps: - * 1. If a preload is needed, it will preload the module and return the object URL. - * 2. 
Otherwise, it will perform a dynamic import of the module. + * 1. If BUILD_DEFS.DISABLE_DYNAMIC_IMPORT is true, use the embedded module. + * 2. If a preload is needed, it will preload the module and return the object URL. + * 3. Otherwise, it will perform a dynamic import of the module. * * @returns - A promise that resolves to a tuple of 2 elements: * - The object URL of the preloaded module, or undefined if no preload is needed. @@ -135,22 +151,26 @@ export const importProxyWorker = async(): Promise<[undefined | string, Worker]> export const importWasmModule = async( urlOverride: string|undefined, prefixOverride: string|undefined, isMultiThreaded: boolean): Promise<[undefined | string, EmscriptenModuleFactory]> => { - const wasmModuleFilename = !BUILD_DEFS.DISABLE_TRAINING ? 'ort-training-wasm-simd-threaded.mjs' : - !BUILD_DEFS.DISABLE_JSEP ? 'ort-wasm-simd-threaded.jsep.mjs' : - 'ort-wasm-simd-threaded.mjs'; - const wasmModuleUrl = urlOverride ?? normalizeUrl(wasmModuleFilename, prefixOverride); - // need to preload if all of the following conditions are met: - // 1. not in Node.js. - // - Node.js does not have the same origin policy for creating workers. - // 2. multi-threaded is enabled. - // - If multi-threaded is disabled, no worker will be created. So we don't need to preload the module. - // 3. the absolute URL is available. - // - If the absolute URL is failed to be created, the origin cannot be determined. In this case, we will not - // preload the module. - // 4. the worker URL is not from the same origin. - // - If the worker URL is from the same origin, we can create the worker directly. - const needPreload = !isNode && isMultiThreaded && wasmModuleUrl && !isSameOrigin(wasmModuleUrl, prefixOverride); - const url = - needPreload ? (await preload(wasmModuleUrl)) : (wasmModuleUrl ?? fallbackUrl(wasmModuleFilename, prefixOverride)); - return [needPreload ? url : undefined, await dynamicImportDefault>(url)]; + if (BUILD_DEFS.DISABLE_DYNAMIC_IMPORT) { + return [undefined, embeddedWasmModule!]; + } else { + const wasmModuleFilename = !BUILD_DEFS.DISABLE_TRAINING ? 'ort-training-wasm-simd-threaded.mjs' : + !BUILD_DEFS.DISABLE_JSEP ? 'ort-wasm-simd-threaded.jsep.mjs' : + 'ort-wasm-simd-threaded.mjs'; + const wasmModuleUrl = urlOverride ?? normalizeUrl(wasmModuleFilename, prefixOverride); + // need to preload if all of the following conditions are met: + // 1. not in Node.js. + // - Node.js does not have the same origin policy for creating workers. + // 2. multi-threaded is enabled. + // - If multi-threaded is disabled, no worker will be created. So we don't need to preload the module. + // 3. the absolute URL is available. + // - If the absolute URL is failed to be created, the origin cannot be determined. In this case, we will not + // preload the module. + // 4. the worker URL is not from the same origin. + // - If the worker URL is from the same origin, we can create the worker directly. + const needPreload = !isNode && isMultiThreaded && wasmModuleUrl && !isSameOrigin(wasmModuleUrl, prefixOverride); + const url = needPreload ? (await preload(wasmModuleUrl)) : + (wasmModuleUrl ?? fallbackUrl(wasmModuleFilename, prefixOverride)); + return [needPreload ? 
url : undefined, await dynamicImportDefault>(url)]; + } }; diff --git a/js/web/script/build.ts b/js/web/script/build.ts index 7ef9bb6b70347..eba5efa3f11e0 100644 --- a/js/web/script/build.ts +++ b/js/web/script/build.ts @@ -57,6 +57,7 @@ const DEFAULT_DEFINE = { 'BUILD_DEFS.DISABLE_WASM': 'false', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'false', 'BUILD_DEFS.DISABLE_TRAINING': 'true', + 'BUILD_DEFS.DISABLE_DYNAMIC_IMPORT': 'false', 'BUILD_DEFS.IS_ESM': 'false', 'BUILD_DEFS.ESM_IMPORT_META_URL': 'undefined', @@ -76,7 +77,102 @@ interface OrtBuildOptions { readonly define?: Record; } -const esbuildAlreadyBuilt = new Map(); +const terserAlreadyBuilt = new Map(); + +/** + * This function is only used to minify the Emscripten generated JS code. The ESBuild minify option is not able to + * tree-shake some unused code as expected. Specifically, there are 2 issues: + * 1. the use of `await import("module")` + * 2. the use of `await import("worker_threads")`, with top-level "await". + * + * The 2 code snippets mentioned above are guarded by feature checks to make sure they are only run in Node.js. However, + * ESBuild fails to tree-shake them and will include them in the final bundle. It will generate code like this: + * + * ```js + * // original code (example, not exact generated code) + * var isNode = typeof process !== 'undefined' && process.versions?.node; + * if (isNode) { + * const {createRequire} = await import('module'); + * ... + * } + * + * // minimized code (with setting "define: {'process': 'undefined'}") + * var x=!0;if(x){const{createRequire:rt}=await import("module");...} + * ``` + * + * The remaining dynamic import call makes trouble for further building steps. To solve this issue, we use Terser to + * minify the Emscripten generated JS code. Terser does more aggressive optimizations and is able to tree-shake the + * unused code with special configurations. + * + * We assume the minimized code does not contain any dynamic import calls. + */ +async function minifyWasmModuleJsForBrowser(filepath: string): Promise { + const code = terserAlreadyBuilt.get(filepath); + if (code) { + return code; + } + + const doMinify = (async () => { + const TIME_TAG = `BUILD:terserMinify:${filepath}`; + console.time(TIME_TAG); + + const contents = await fs.readFile(filepath, {encoding: 'utf-8'}); + + // Find the first and the only occurrence of minified function implementation of "_emscripten_thread_set_strongref": + // ```js + // _emscripten_thread_set_strongref: (thread) => { + // if (ENVIRONMENT_IS_NODE) { + // PThread.pthreads[thread].ref(); + // } + // } + // ``` + // + // It is minified to: (example) + // ```js + // function Pb(a){D&&N[a>>>0].ref()} + // ``` + + // The following code will look for the function name and mark the function call as pure, so that Terser will + // minify the code correctly. + + const markedAsPure = []; + // First, try if we are working on the original (not minified) source file. This is when we are working with the + // debug build. + const isOriginal = contents.includes('PThread.pthreads[thread].ref()'); + if (isOriginal) { + markedAsPure.push('PThread.pthreads[thread].ref'); + } else { + // If it is not the original source file, we need to find the minified function call. 
+ const matches = [...contents.matchAll(/\{[_a-zA-Z][_a-zA-Z0-9]*&&([_a-zA-Z][_a-zA-Z0-9]*\[.+?]\.ref)\(\)}/g)]; + if (matches.length !== 1) { + throw new Error(`Unexpected number of matches for minified "PThread.pthreads[thread].ref()" in "${filepath}": ${ + matches.length}.`); + } + // matches[0] is the first and the only match. + // matches[0][0] is the full matched string and matches[0][1] is the first capturing group. + markedAsPure.push(matches[0][1]); + } + + const terser = await import('terser'); + const result = await terser.minify(contents, { + module: true, + compress: { + passes: 2, + global_defs: {'process': undefined, 'globalThis.process': undefined}, + pure_funcs: markedAsPure, + }, + }); + + console.timeEnd(TIME_TAG); + + return result.code!; + })(); + + terserAlreadyBuilt.set(filepath, doMinify); + return doMinify; +} + +const esbuildAlreadyBuilt = new Map(); async function buildBundle(options: esbuild.BuildOptions) { // Skip if the same build options have been built before. const serializedOptions = JSON.stringify(options); @@ -162,18 +258,31 @@ async function buildOrt({ const platform = isNode ? 'node' : 'browser'; const external = isNode ? ['onnxruntime-common'] : ['node:fs/promises', 'node:fs', 'node:os', 'module', 'worker_threads']; + const plugins: esbuild.Plugin[] = []; const defineOverride: Record = {}; if (!isNode) { defineOverride.process = 'undefined'; defineOverride['globalThis.process'] = 'undefined'; } + if (define['BUILD_DEFS.DISABLE_DYNAMIC_IMPORT'] === 'true') { + plugins.push({ + name: 'emscripten-mjs-handler', + setup(build: esbuild.PluginBuild) { + build.onLoad( + {filter: /dist[\\/]ort-.*wasm.*\.mjs$/}, + async args => ({contents: await minifyWasmModuleJsForBrowser(args.path)})); + } + }); + } + await buildBundle({ entryPoints: ['web/lib/index.ts'], outfile: `web/dist/${outputName}${isProduction ? '.min' : ''}.${format === 'esm' ? 'mjs' : 'js'}`, platform, format, globalName: 'ort', + plugins, external, define: {...define, ...defineOverride}, sourcemap: isProduction ? 'linked' : 'inline', @@ -280,8 +389,8 @@ async function postProcess() { } } if (!found) { - if (file.includes('webgl')) { - // skip webgl + if (file.includes('.webgl.') || file.includes('.bundle.')) { + // skip webgl and bundle, they don't have dynamic import calls. continue; } throw new Error(`Dynamic import call not found in "${jsFilePath}". Should not happen.`); @@ -363,7 +472,7 @@ async function validate() { // all files should contain the magic comment to ignore dynamic import calls. // - if (!file.includes('webgl') && !file.startsWith('ort.esm.')) { + if (!file.includes('.webgl.') && !file.includes('.bundle.')) { const contentToSearch = isMinified ? 
'/*webpackIgnore:true*/' : '/* webpackIgnore: true */'; if (!content.includes(contentToSearch)) { throw new Error(`Validation failed: "${file}" does not contain magic comment.`); @@ -457,17 +566,40 @@ async function main() { if (BUNDLE_MODE === 'prod') { // ort.all[.min].[m]js await addAllWebBuildTasks({outputName: 'ort.all'}); + // ort.all.bundle.min.mjs + await buildOrt({ + isProduction: true, + outputName: 'ort.all.bundle', + format: 'esm', + define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_DYNAMIC_IMPORT': 'true'}, + }); // ort[.min].[m]js await addAllWebBuildTasks({ outputName: 'ort', define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_JSEP': 'true'}, }); + // ort.bundle.min.mjs + await buildOrt({ + isProduction: true, + outputName: 'ort.bundle', + format: 'esm', + define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_JSEP': 'true', 'BUILD_DEFS.DISABLE_DYNAMIC_IMPORT': 'true'}, + }); + // ort.webgpu[.min].[m]js await addAllWebBuildTasks({ outputName: 'ort.webgpu', define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true'}, }); + // ort.webgpu.bundle.min.mjs + await buildOrt({ + isProduction: true, + outputName: 'ort.webgpu.bundle', + format: 'esm', + define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true', 'BUILD_DEFS.DISABLE_DYNAMIC_IMPORT': 'true'}, + }); + // ort.wasm[.min].[m]js await addAllWebBuildTasks({ outputName: 'ort.wasm', diff --git a/js/web/test/data/ops/quick-gelu.jsonc b/js/web/test/data/ops/quick-gelu.jsonc new file mode 100644 index 0000000000000..a6e618fe34796 --- /dev/null +++ b/js/web/test/data/ops/quick-gelu.jsonc @@ -0,0 +1,46 @@ +[ + { + "name": "QuickGelu test", + "operator": "QuickGelu", + "opset": { "domain": "com.microsoft", "version": 1 }, + "cases": [ + { + "name": "[2x4]", + "inputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, -0.8], + "dims": [2, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.0542447, 0.116857, 0.187484, 0.265566, 0.350388, 0.441123, 0.53689, 0.636815], + "dims": [2, 4], + "type": "float32" + } + ] + }, + { + "name": "[3x5]", + "inputs": [ + { + "data": [0.1, 0.2, 0.3, 0.4, 0.5, 1, 2, 3, 4, 5, 1.1, 1.2, 1.3, 1.4, -1.5], + "dims": [3, 5], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 0.0542447, 0.116857, 0.187484, 0.265566, 0.350388, 0.845795, 1.9356, 2.98192, 3.99558, 4.99899, 0.953383, + 1.0622, 1.17178, 1.2817, 1.39166 + ], + "dims": [3, 5], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/e2e/run-data.js b/js/web/test/e2e/run-data.js index 58371bafd276d..507192f29be9c 100644 --- a/js/web/test/e2e/run-data.js +++ b/js/web/test/e2e/run-data.js @@ -30,6 +30,12 @@ const BROWSER_TEST_CASES = [ [true, true, './browser-test-wasm.js', 'ort.min.mjs', ['num_threads=2', 'proxy=1']], // wasm, 2 threads, proxy [true, true, './browser-test-wasm.js', 'ort.min.mjs', ['num_threads=1', 'proxy=1']], // wasm, 1 thread, proxy + // ort.bundle.min.mjs + [true, false, './browser-test-wasm.js', 'ort.bundle.min.mjs', ['num_threads=1']], // 1 thread + [true, false, './browser-test-wasm.js', 'ort.bundle.min.mjs', ['num_threads=2']], // 2 threads + [true, false, './browser-test-wasm.js', 'ort.bundle.min.mjs', ['num_threads=2', 'proxy=1']], // 2 threads, proxy + [true, false, './browser-test-wasm.js', 'ort.bundle.min.mjs', ['num_threads=1', 'proxy=1']], // 1 thread, proxy + // path override: // wasm, path override filenames for both mjs and wasm, same origin [true, false, './browser-test-wasm-path-override-filename.js', 'ort.min.js', ['port=9876', 'files=mjs,wasm']], diff --git 
a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h index 34f57c1655cc2..8ae7b4589d677 100644 --- a/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/attention_cpu_base.h @@ -69,9 +69,8 @@ class AttentionCPUBase : public AttentionBase { BufferUniquePtr mask_data_buffer(mask_data, BufferDeleter(allocator)); const int32_t* mask_index_data = mask_index != nullptr ? mask_index->Data() : nullptr; - gsl::span mask_index_dims = mask_index != nullptr - ? mask_index->Shape().GetDims() - : gsl::span{}; + gsl::span mask_index_dims = + mask_index != nullptr ? mask_index->Shape().GetDims() : gsl::span{}; const T* past_data = past != nullptr ? past->Data() : nullptr; T* present_data = present != nullptr ? present->MutableData() : nullptr; const T* past_key_data = past_key != nullptr ? past_key->Data() : nullptr; @@ -84,22 +83,19 @@ class AttentionCPUBase : public AttentionBase { relative_position_bias_data = relative_position_bias->Data(); } - ComputeAttentionProbs(static_cast(attention_probs), Q, K, - mask_index_data, mask_index_dims, static_cast(mask_data), causal, - batch_size, sequence_length, kv_sequence_length, past_sequence_length, - qk_head_size == 0 ? v_head_size : qk_head_size, past_data, past_key_data, - present_data, present_key_data, tp, relative_position_bias_data); + ComputeAttentionProbs(static_cast(attention_probs), Q, K, mask_index_data, mask_index_dims, + static_cast(mask_data), causal, batch_size, sequence_length, kv_sequence_length, + past_sequence_length, qk_head_size == 0 ? v_head_size : qk_head_size, past_data, + past_key_data, present_data, present_key_data, tp, relative_position_bias_data); // Compute the attentionScore * Value: out_tmp(B, N, S, H_v) = attention_probs(B, N, S, T) x V(B, N, T, H_v) auto out_tmp_data = allocator->Alloc(SafeInt(batch_size) * num_heads_ * sequence_length * v_head_size * sizeof(T)); BufferUniquePtr out_tmp_buffer(out_tmp_data, BufferDeleter(std::move(allocator))); - ComputeVxAttentionScore(output->MutableData(), static_cast(out_tmp_data), - static_cast(attention_probs), V, - batch_size, sequence_length, kv_sequence_length, past_sequence_length, - v_head_size, v_hidden_size, past_data, past_value_data, - present_data, present_value_data, tp); + ComputeVxAttentionScore(output->MutableData(), static_cast(out_tmp_data), static_cast(attention_probs), + V, batch_size, sequence_length, kv_sequence_length, past_sequence_length, v_head_size, + v_hidden_size, past_data, past_value_data, present_data, present_value_data, tp); return Status::OK(); } @@ -138,16 +134,17 @@ class AttentionCPUBase : public AttentionBase { { // mask_data is nullptr when mask_index is nullptr and not unidirectional, otherwise its shape is BxSxT if (mask_data != nullptr) { - PrepareMask(mask_index, mask_index_dims, mask_data, - causal, batch_size, sequence_length, past_sequence_length, mask_filter_value_); + PrepareMask(mask_index, mask_index_dims, mask_data, causal, batch_size, sequence_length, past_sequence_length, + mask_filter_value_); } const int loop_len = batch_size * num_heads_; const float alpha = scale_ == 0.0f ? 
1.0f / sqrt(static_cast(head_size)) : scale_; TensorOpCost unit_cost; - const size_t probs_matrix_bytes = SafeInt(sequence_length) * total_sequence_length * sizeof(T); - unit_cost.compute_cycles = static_cast(2 * sequence_length * head_size * total_sequence_length); + const ptrdiff_t probs_matrix_bytes = SafeInt(sequence_length) * total_sequence_length * sizeof(T); + unit_cost.compute_cycles = + static_cast(SafeInt(2) * sequence_length * head_size * total_sequence_length); unit_cost.bytes_loaded = static_cast((sequence_length + total_sequence_length) * head_size * sizeof(T)); unit_cost.bytes_stored = static_cast(probs_matrix_bytes); @@ -172,15 +169,13 @@ class AttentionCPUBase : public AttentionBase { for (std::ptrdiff_t i = begin; i != end; ++i) { const int batch_index = static_cast(i) / num_heads_; - const int output_offset = static_cast(i) * sequence_length * total_sequence_length; - const int mask_offset = batch_index * sequence_length * total_sequence_length; + const ptrdiff_t output_offset = SafeInt(i) * sequence_length * total_sequence_length; + const ptrdiff_t mask_offset = SafeInt(batch_index) * sequence_length * total_sequence_length; T* output = attention_probs + output_offset; // Broadcast mask data: (Bx)SxT -> (BxNx)SxT if (mask_data != nullptr) { - memcpy(output, - mask_data + mask_offset, - probs_matrix_bytes); + memcpy(output, mask_data + mask_offset, probs_matrix_bytes); } const T* k = K + kv_input_chunk_length * i; @@ -197,8 +192,8 @@ class AttentionCPUBase : public AttentionBase { // B: K' (B x N x) T x H (B x N x) H x T H x T // C: attention_probs (B x N x) S x T (B x N x) S x T S x T math::Gemm(CblasNoTrans, CblasTrans, sequence_length, total_sequence_length, head_size, alpha, - Q + q_input_chunk_length * i, k, mask_data != nullptr ? 1.0f : 0.0f, - output, nullptr); + Q + q_input_chunk_length * i, k, mask_data != nullptr ? 
1.0f : 0.0f, output, + nullptr); if (relative_position_bias_data != nullptr) { for (int j = 0; j < sequence_length * total_sequence_length; j++) { @@ -249,8 +244,10 @@ class AttentionCPUBase : public AttentionBase { // The cost of Gemm TensorOpCost unit_cost; - unit_cost.compute_cycles = static_cast(2 * sequence_length * v_head_size * total_sequence_length); - unit_cost.bytes_loaded = static_cast((sequence_length + v_head_size) * total_sequence_length * sizeof(T)); + unit_cost.compute_cycles = + static_cast(SafeInt(2) * sequence_length * v_head_size * total_sequence_length); + unit_cost.bytes_loaded = + static_cast(SafeInt(sequence_length + v_head_size) * total_sequence_length * sizeof(T)); unit_cost.bytes_stored = static_cast(sequence_length * v_head_size * sizeof(T)); if (present || present_value) { @@ -264,35 +261,36 @@ class AttentionCPUBase : public AttentionBase { unit_cost.bytes_loaded += bytes_to_copy_trans_all; unit_cost.bytes_stored += bytes_to_copy_trans_all; - ThreadPool::TryParallelFor(tp, SafeInt(batch_size) * num_heads_, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { - for (std::ptrdiff_t i = begin; i != end; ++i) { - const T* v = V + kv_input_chunk_length * i; - if (nullptr != present) { - // Concatenate past_V and V: (BxNx)PxH_v, (BxNx)LxH_v -> (BxNx)TxH_v - v = ConcatStateChunk(past, v, present, past_chunk_length, present_chunk_length, i); - } else if (nullptr != present_value) { - v = ConcatStateChunk(past_value, v, present_value, past_chunk_length, present_chunk_length, i); - } + ThreadPool::TryParallelFor( + tp, SafeInt(batch_size) * num_heads_, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { + for (std::ptrdiff_t i = begin; i != end; ++i) { + const T* v = V + kv_input_chunk_length * i; + if (nullptr != present) { + // Concatenate past_V and V: (BxNx)PxH_v, (BxNx)LxH_v -> (BxNx)TxH_v + v = ConcatStateChunk(past, v, present, past_chunk_length, present_chunk_length, i); + } else if (nullptr != present_value) { + v = ConcatStateChunk(past_value, v, present_value, past_chunk_length, present_chunk_length, i); + } - T* current_tmp_data = reinterpret_cast(tmp_buffer) + q_input_chunk_length * i; - ptrdiff_t attention_probs_offset = SafeInt(sequence_length) * total_sequence_length * i; - math::MatMul(sequence_length, v_head_size, total_sequence_length, - attention_probs + attention_probs_offset, - v, current_tmp_data, nullptr); - - // Transpose: out(B, S, N, H_v) -> out_tmp(B, N, S, H_v) - const int batch_index = static_cast(i / num_heads_); - const int head_index = static_cast(i % num_heads_); - T* src = current_tmp_data; - ptrdiff_t dest_offset = (SafeInt(batch_index) * sequence_length * num_heads_ + head_index) * v_head_size; - T* dest = output + dest_offset; - for (int j = 0; j < sequence_length; j++) { - memcpy(dest, src, bytes_to_copy_trans); - src += v_head_size; - dest += v_hidden_size; - } - } - }); + T* current_tmp_data = reinterpret_cast(tmp_buffer) + q_input_chunk_length * i; + ptrdiff_t attention_probs_offset = SafeInt(sequence_length) * total_sequence_length * i; + math::MatMul(sequence_length, v_head_size, total_sequence_length, + attention_probs + attention_probs_offset, v, current_tmp_data, nullptr); + + // Transpose: out(B, S, N, H_v) -> out_tmp(B, N, S, H_v) + const int batch_index = static_cast(i / num_heads_); + const int head_index = static_cast(i % num_heads_); + T* src = current_tmp_data; + ptrdiff_t dest_offset = + (SafeInt(batch_index) * sequence_length * num_heads_ + head_index) * v_head_size; + T* dest = output + dest_offset; + 
for (int j = 0; j < sequence_length; j++) { + memcpy(dest, src, bytes_to_copy_trans); + src += v_head_size; + dest += v_hidden_size; + } + } + }); } }; diff --git a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h index fa80efffc9ea1..6b0c5f395cab0 100644 --- a/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h +++ b/onnxruntime/contrib_ops/cpu/bert/gqa_attention_base.h @@ -63,17 +63,16 @@ class GQAAttentionBase : public AttentionBase { bool past_present_share_buffer = past_key_data == present_key_data && past_value_data == present_value_data; const T* k = packed_qkv ? Q + num_heads_ * sequence_length * head_size : K; - ComputeAttentionProbs(static_cast(attention_probs), Q, k, - seqlens_k->Data(), - batch_size, sequence_length, seqlen_past_kv_cache, seqlen_present_kv_cache, - head_size, past_key_data, present_key_data, past_present_share_buffer, packed_qkv, tp); + ComputeAttentionProbs(static_cast(attention_probs), Q, k, seqlens_k->Data(), batch_size, + sequence_length, seqlen_past_kv_cache, seqlen_present_kv_cache, head_size, past_key_data, + present_key_data, past_present_share_buffer, packed_qkv, tp); // Compute the attentionScore * Value: out(B, N, S, H_v) = attention_probs(B, N, S, T) x V(B, N, T, H_v) const T* v = packed_qkv ? Q + (num_heads_ + kv_num_heads_) * sequence_length * head_size : V; - ComputeVxAttentionScore(output->MutableData(), static_cast(attention_probs), - v, seqlens_k->Data(), batch_size, sequence_length, seqlen_past_kv_cache, - seqlen_present_kv_cache, head_size, hidden_size, past_value_data, present_value_data, - past_present_share_buffer, packed_qkv, tp); + ComputeVxAttentionScore(output->MutableData(), static_cast(attention_probs), v, seqlens_k->Data(), + batch_size, sequence_length, seqlen_past_kv_cache, seqlen_present_kv_cache, head_size, + hidden_size, past_value_data, present_value_data, past_present_share_buffer, packed_qkv, + tp); return Status::OK(); } @@ -98,7 +97,9 @@ class GQAAttentionBase : public AttentionBase { bool packed_qkv, // whether Q, K, V are packed ThreadPool* tp) const { // thread pool const bool is_prompt = sequence_length != 1; - const int packed_batch_stride = packed_qkv ? (num_heads_ + 2 * kv_num_heads_) * sequence_length * head_size : 0; + const ptrdiff_t packed_batch_stride = + packed_qkv ? SafeInt(num_heads_ + 2 * kv_num_heads_) * sequence_length * head_size + : SafeInt(0); const int kv_num_heads_factor = num_heads_ / kv_num_heads_; const size_t q_input_chunk_length = static_cast(sequence_length) * head_size; // S x H const size_t kv_input_chunk_length = static_cast(sequence_length) * head_size; // L x H @@ -113,9 +114,12 @@ class GQAAttentionBase : public AttentionBase { const float alpha = scale_ == 0.0f ? 
1.0f / sqrt(static_cast(head_size)) : scale_; TensorOpCost unit_cost; - const size_t probs_matrix_bytes = SafeInt(sequence_length) * present_buffer_sequence_length * sizeof(T); - unit_cost.compute_cycles = static_cast(2 * sequence_length * head_size * present_buffer_sequence_length); - unit_cost.bytes_loaded = static_cast((sequence_length + present_buffer_sequence_length) * head_size * sizeof(T)); + const ptrdiff_t probs_matrix_bytes = + SafeInt(sequence_length) * present_buffer_sequence_length * sizeof(T); + unit_cost.compute_cycles = + static_cast(SafeInt(2) * sequence_length * head_size * present_buffer_sequence_length); + unit_cost.bytes_loaded = + static_cast((sequence_length + present_buffer_sequence_length) * head_size * sizeof(T)); unit_cost.bytes_stored = static_cast(probs_matrix_bytes); unit_cost.bytes_loaded += static_cast(probs_matrix_bytes); @@ -131,11 +135,12 @@ class GQAAttentionBase : public AttentionBase { for (std::ptrdiff_t i = begin; i != end; ++i) { const int batch_index = static_cast(i) / num_heads_; const int head_index = static_cast(i) % num_heads_; - const int past_seqlen = sequence_length == 1 ? static_cast(seqlens_k[batch_index]) : past_buffer_sequence_length; + const int past_seqlen = + sequence_length == 1 ? static_cast(seqlens_k[batch_index]) : past_buffer_sequence_length; const size_t past_chunk_length = static_cast(past_seqlen) * head_size; const int total_seqlen = seqlens_k[batch_index] + 1; - const int output_offset = static_cast(i) * sequence_length * present_buffer_sequence_length; + const ptrdiff_t output_offset = SafeInt(i) * sequence_length * present_buffer_sequence_length; T* output = attention_probs + output_offset; const T* k; @@ -161,11 +166,9 @@ class GQAAttentionBase : public AttentionBase { } else { q = Q + q_input_chunk_length * i; } - math::GemmEx(CblasNoTrans, CblasTrans, - sequence_length, total_seqlen, head_size, alpha, - q, head_size, k, head_size, - 0.0f /*bata*/, - output, present_buffer_sequence_length, nullptr); + math::GemmEx(CblasNoTrans, CblasTrans, sequence_length, total_seqlen, head_size, alpha, q, + head_size, k, head_size, 0.0f /*bata*/, output, present_buffer_sequence_length, + nullptr); // compute Softmax T* output_softmax = output; @@ -175,7 +178,8 @@ class GQAAttentionBase : public AttentionBase { for (int total_seq_id = 0; total_seq_id < seq_causal_length - local_window_size_ - 1; total_seq_id++) { output_softmax[total_seq_id] = 0.f; } - ComputeAttentionSoftmaxInplace(output_softmax + seq_causal_length - local_window_size_ - 1, 1, local_window_size_ + 1, nullptr); + ComputeAttentionSoftmaxInplace(output_softmax + seq_causal_length - local_window_size_ - 1, 1, + local_window_size_ + 1, nullptr); } else { ComputeAttentionSoftmaxInplace(output_softmax, 1, seq_causal_length, nullptr); } @@ -208,7 +212,9 @@ class GQAAttentionBase : public AttentionBase { bool packed_qkv, // whether Q, K, V are packed ThreadPool* tp) const { const bool is_prompt = sequence_length != 1; - const int packed_batch_stride = packed_qkv ? (num_heads_ + 2 * kv_num_heads_) * sequence_length * head_size : 0; + const ptrdiff_t packed_batch_stride = + packed_qkv ? 
SafeInt(num_heads_ + 2 * kv_num_heads_) * sequence_length * head_size + : SafeInt(0); const int kv_num_heads_factor = num_heads_ / kv_num_heads_; const int kv_input_chunk_length = sequence_length * head_size; // L x H const size_t past_buff_chunk_length = static_cast(past_buffer_sequence_length) * head_size; // L x H @@ -220,8 +226,10 @@ class GQAAttentionBase : public AttentionBase { // The cost of Gemm TensorOpCost unit_cost; - unit_cost.compute_cycles = static_cast(2 * sequence_length * head_size * present_buffer_sequence_length); - unit_cost.bytes_loaded = static_cast((sequence_length + head_size) * present_buffer_sequence_length * sizeof(T)); + unit_cost.compute_cycles = + static_cast(SafeInt(2) * sequence_length * head_size * present_buffer_sequence_length); + unit_cost.bytes_loaded = static_cast(SafeInt(sequence_length + head_size) * + present_buffer_sequence_length * sizeof(T)); unit_cost.bytes_stored = static_cast(sequence_length * head_size * sizeof(T)); if (present_value) { @@ -235,39 +243,37 @@ class GQAAttentionBase : public AttentionBase { unit_cost.bytes_loaded += bytes_to_copy_trans_all; unit_cost.bytes_stored += bytes_to_copy_trans_all; - ThreadPool::TryParallelFor(tp, SafeInt(batch_size) * num_heads_, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { - for (std::ptrdiff_t i = begin; i != end; ++i) { - const int batch_index = static_cast(i / num_heads_); - const int head_index = static_cast(i % num_heads_); - const int past_seqlen = sequence_length == 1 ? static_cast(seqlens_k[batch_index]) : past_buffer_sequence_length; - const size_t past_chunk_length = static_cast(past_seqlen) * head_size; - const int total_seqlen = seqlens_k[batch_index] + 1; + ThreadPool::TryParallelFor( + tp, SafeInt(batch_size) * num_heads_, unit_cost, [&](std::ptrdiff_t begin, std::ptrdiff_t end) { + for (std::ptrdiff_t i = begin; i != end; ++i) { + const int batch_index = static_cast(i / num_heads_); + const int head_index = static_cast(i % num_heads_); + const int past_seqlen = + sequence_length == 1 ? 
static_cast(seqlens_k[batch_index]) : past_buffer_sequence_length; + const size_t past_chunk_length = static_cast(past_seqlen) * head_size; + const int total_seqlen = seqlens_k[batch_index] + 1; + + const T* v; + if (packed_qkv) { + v = V + packed_batch_stride * batch_index + kv_input_chunk_length * (head_index / kv_num_heads_factor); + } else { + v = V + kv_input_chunk_length * (i / kv_num_heads_factor); + } + if (nullptr != present_value) { + v = ConcatStateChunkGQA(past_value, v, present_value, present_buff_chunk_length, past_buff_chunk_length, + past_chunk_length, kv_input_chunk_length, is_prompt, past_present_share_buffer, + i / kv_num_heads_factor); + } - const T* v; - if (packed_qkv) { - v = V + packed_batch_stride * batch_index + kv_input_chunk_length * (head_index / kv_num_heads_factor); - } else { - v = V + kv_input_chunk_length * (i / kv_num_heads_factor); - } - if (nullptr != present_value) { - v = ConcatStateChunkGQA(past_value, v, present_value, present_buff_chunk_length, past_buff_chunk_length, - past_chunk_length, kv_input_chunk_length, is_prompt, past_present_share_buffer, - i / kv_num_heads_factor); - } + T* output_current = output + (batch_index * sequence_length * num_heads_ + head_index) * head_size; + ptrdiff_t attention_probs_offset = SafeInt(sequence_length) * present_buffer_sequence_length * i; - T* output_current = output + (batch_index * sequence_length * num_heads_ + head_index) * head_size; - ptrdiff_t attention_probs_offset = SafeInt(sequence_length) * present_buffer_sequence_length * i; - - math::GemmEx(CblasNoTrans, - CblasNoTrans, - sequence_length, head_size, total_seqlen, - 1.f, /*alpha*/ - attention_probs + attention_probs_offset, present_buffer_sequence_length, - v, head_size, - 0.0f /*beta*/, - output_current, hidden_size, nullptr); - } - }); + math::GemmEx(CblasNoTrans, CblasNoTrans, sequence_length, head_size, total_seqlen, + 1.f, /*alpha*/ + attention_probs + attention_probs_offset, present_buffer_sequence_length, v, + head_size, 0.0f /*beta*/, output_current, hidden_size, nullptr); + } + }); } }; diff --git a/onnxruntime/contrib_ops/js/js_contrib_kernels.cc b/onnxruntime/contrib_ops/js/js_contrib_kernels.cc index 9d8f79c67d8a4..7bc3414c89978 100644 --- a/onnxruntime/contrib_ops/js/js_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/js/js_contrib_kernels.cc @@ -16,6 +16,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, Gelu); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, GroupQueryAttention); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, MatMulNBits); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, MultiHeadAttention); +class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, QuickGelu); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, RotaryEmbedding); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kMSDomain, 1, SkipLayerNormalization); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kJsExecutionProvider, kOnnxDomain, 1, SimplifiedLayerNormalization); @@ -38,6 +39,7 @@ Status RegisterJsContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/onnxruntime/contrib_ops/js/quick_gelu.cc b/onnxruntime/contrib_ops/js/quick_gelu.cc new file mode 100644 index 0000000000000..4bb4d5afd4109 --- /dev/null +++ b/onnxruntime/contrib_ops/js/quick_gelu.cc @@ -0,0 +1,23 
@@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "quick_gelu.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +using onnxruntime::js::JsepSupportedFloatTypes; + +ONNX_OPERATOR_KERNEL_EX( + QuickGelu, + kMSDomain, + 1, + kJsExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", JsepSupportedFloatTypes()), + QuickGelu); + +} // namespace js +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/js/quick_gelu.h b/onnxruntime/contrib_ops/js/quick_gelu.h new file mode 100644 index 0000000000000..51e39e2718d51 --- /dev/null +++ b/onnxruntime/contrib_ops/js/quick_gelu.h @@ -0,0 +1,24 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/js/js_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace js { + +using onnxruntime::js::JsKernel; + +class QuickGelu final : public JsKernel { + public: + explicit QuickGelu(const OpKernelInfo& info) : JsKernel(info) { + float alpha = info.GetAttrOrDefault("alpha", 1.0); + JSEP_INIT_KERNEL_ATTRIBUTE(QuickGelu, ({"alpha" : $1}), alpha); + } +}; + +} // namespace js +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp index 633349e800875..a67be1dbfa710 100644 --- a/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp +++ b/onnxruntime/core/mlas/lib/power/qgemm_kernel_power10.cpp @@ -67,7 +67,7 @@ MlasGemmQuantFixupZeroPointB( } -template +template void MlasGemmQuantCopyPackA8x8( MLAS_GEMM_QUANT_KERNEL_POWER10::PackedAType* D, @@ -75,11 +75,10 @@ MlasGemmQuantCopyPackA8x8( size_t lda, size_t CountM, size_t CountK, - int32_t* RowSumBuffer, - bool AIsSigned + int32_t* RowSumBuffer ) { - const uint8_t Flip = (AIsSigned ? 0 : 0x80); + constexpr uint8_t Flip = (AIsSigned ? 
0 : 0x80); Vtype vmask = reinterpret_cast(vec_splats(Flip)); typedef __vector signed char vec_t; @@ -106,66 +105,74 @@ MlasGemmQuantCopyPackA8x8( Vtype a3 = *reinterpret_cast(&a[lda * 2]); Vtype a4 = *reinterpret_cast(&a[lda * 3]); Vtype vx = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx1 = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); Vtype vx2 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx3 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); - Vtype vx4 = vec_xxpermdi (vx, vx1, 0); - Vtype vx5 = vec_xxpermdi (vx2, vx3, 0); - Vtype vx6 = vec_xxpermdi (vx, vx1, 3); - Vtype vx7 = vec_xxpermdi (vx2, vx3, 3); + Vtype vx4 = vec_xxpermdi(vx, vx1, 0); + Vtype vx5 = vec_xxpermdi(vx2, vx3, 0); + Vtype vx6 = vec_xxpermdi(vx, vx1, 3); + Vtype vx7 = vec_xxpermdi(vx2, vx3, 3); a1 = *reinterpret_cast(&a[lda*4]); a2 = *reinterpret_cast(&a[lda*5]); a3 = *reinterpret_cast(&a[lda*6]); a4 = *reinterpret_cast(&a[lda*7]); vx = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); vx1 = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); vx2 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); vx3 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); - Vtype vx8 = vec_xxpermdi (vx, vx1, 0); - Vtype vx9 = vec_xxpermdi (vx2, vx3, 0); - Vtype vx10 = vec_xxpermdi (vx, vx1, 3); - Vtype vx11 = vec_xxpermdi (vx2, vx3, 3); + Vtype vx8 = vec_xxpermdi(vx, vx1, 0); + Vtype vx9 = vec_xxpermdi(vx2, vx3, 0); + Vtype vx10 = vec_xxpermdi(vx, vx1, 3); + Vtype vx11 = vec_xxpermdi(vx2, vx3, 3); vec_t vxx = - reinterpret_cast(vec_sub (vx4, vmask)); - vsum = vec_sum4s (vxx, vsum); + AIsSigned ? reinterpret_cast(vx4) : + reinterpret_cast(vec_sub(vx4, vmask)); + vsum = vec_sum4s(vxx, vsum); *reinterpret_cast(&D[0]) = vxx; - vxx = reinterpret_cast(vec_sub (vx5, vmask)); - vsum = vec_sum4s (vxx, vsum); + vxx = AIsSigned ? reinterpret_cast(vx5) : + reinterpret_cast(vec_sub(vx5, vmask)); + vsum = vec_sum4s(vxx, vsum); *reinterpret_cast(&D[16]) = vxx; - vxx = reinterpret_cast(vec_sub (vx6, vmask)); - vsum = vec_sum4s (vxx, vsum); + vxx = AIsSigned ? reinterpret_cast(vx6) : + reinterpret_cast(vec_sub(vx6, vmask)); + vsum = vec_sum4s(vxx, vsum); *reinterpret_cast(&D[32]) = vxx; - vxx = reinterpret_cast(vec_sub (vx7, vmask)); - vsum = vec_sum4s (vxx, vsum); + vxx = AIsSigned ? reinterpret_cast(vx7) : + reinterpret_cast(vec_sub(vx7, vmask)); + vsum = vec_sum4s(vxx, vsum); *reinterpret_cast(&D[48]) = vxx; - vxx = reinterpret_cast(vec_sub (vx8, vmask)); + vxx = AIsSigned ? 
reinterpret_cast(vx8) : + reinterpret_cast(vec_sub(vx8, vmask)); *reinterpret_cast(&D[64]) = vxx; - vsum2 = vec_sum4s (vxx, vsum2); - vxx = reinterpret_cast(vec_sub (vx9, vmask)); + vsum2 = vec_sum4s(vxx, vsum2); + vxx = AIsSigned ? reinterpret_cast(vx9) : + reinterpret_cast(vec_sub(vx9, vmask)); *reinterpret_cast(&D[80]) = vxx; - vsum2 = vec_sum4s (vxx, vsum2); - vxx = reinterpret_cast(vec_sub (vx10, vmask)); + vsum2 = vec_sum4s(vxx, vsum2); + vxx = AIsSigned ? reinterpret_cast(vx10) : + reinterpret_cast(vec_sub(vx10, vmask)); *reinterpret_cast(&D[96]) = vxx; - vsum2 = vec_sum4s (vxx, vsum2); - vxx = reinterpret_cast(vec_sub (vx11, vmask)); + vsum2 = vec_sum4s(vxx, vsum2); + vxx = AIsSigned ? reinterpret_cast(vx11) : + reinterpret_cast(vec_sub(vx11, vmask)); *reinterpret_cast(&D[112]) = vxx; - vsum2 = vec_sum4s (vxx, vsum2); + vsum2 = vec_sum4s(vxx, vsum2); D += 16 * 8; a += 16; y -= 16; @@ -179,16 +186,18 @@ MlasGemmQuantCopyPackA8x8( int a4 = *reinterpret_cast(&a[lda*3]); __vector int vx1 = { a1, a2, a3, a4}; vec_t vx = - reinterpret_cast(vec_sub (reinterpret_cast(vx1), vmask)); - vsum = vec_sum4s (vx, vsum); + AIsSigned ? reinterpret_cast(vx1) : + reinterpret_cast(vec_sub(reinterpret_cast(vx1), vmask)); + vsum = vec_sum4s(vx, vsum); *reinterpret_cast(&D[0]) = vx; a1 = *reinterpret_cast(&a[lda*4]); a2 = *reinterpret_cast(&a[lda*5]); a3 = *reinterpret_cast(&a[lda*6]); a4 = *reinterpret_cast(&a[lda*7]); __vector int vx2 = { a1, a2, a3, a4}; - vx = reinterpret_cast(vec_sub (reinterpret_cast(vx2), vmask)); - vsum2 = vec_sum4s (vx, vsum2); + vx = AIsSigned ? reinterpret_cast(vx2) : + reinterpret_cast(vec_sub(reinterpret_cast(vx2), vmask)); + vsum2 = vec_sum4s(vx, vsum2); if (CountK & 3) { if (yval >= 12) { *reinterpret_cast(&D[64]) = vx; @@ -225,10 +234,10 @@ MlasGemmQuantCopyPackA8x8( } if (y >= 1) { - Vtype a1 = reinterpret_cast(vec_splats(Flip)); - Vtype a2 = reinterpret_cast(vec_splats(Flip)); - Vtype a3 = reinterpret_cast(vec_splats(Flip)); - Vtype a4 = reinterpret_cast(vec_splats(Flip)); + Vtype a1 = vmask; + Vtype a2 = vmask; + Vtype a3 = vmask; + Vtype a4 = vmask; a1[0] = a[0]; a2[0] = a[lda]; a3[0] = a[lda * 2]; @@ -246,20 +255,21 @@ MlasGemmQuantCopyPackA8x8( a4[2] = a[lda * 3 + 2]; } Vtype vx = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx1 = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); - Vtype vx2 = vec_xxpermdi (vx, vx1, 0); + Vtype vx2 = vec_xxpermdi(vx, vx1, 0); vec_t vx3 = - reinterpret_cast(vec_sub (vx2, vmask)); - vsum = vec_sum4s (vx3, vsum); + AIsSigned ? 
reinterpret_cast(vx2) : + reinterpret_cast(vec_sub(vx2, vmask)); + vsum = vec_sum4s(vx3, vsum); *reinterpret_cast(&D[0]) = vx3; - a1 = reinterpret_cast(vec_splats(Flip)); - a2 = reinterpret_cast(vec_splats(Flip)); - a3 = reinterpret_cast(vec_splats(Flip)); - a4 = reinterpret_cast(vec_splats(Flip)); + a1 = vmask; + a2 = vmask; + a3 = vmask; + a4 = vmask; a1[0] = a[lda * 4]; a2[0] = a[lda * 5]; a3[0] = a[lda * 6]; @@ -277,14 +287,15 @@ MlasGemmQuantCopyPackA8x8( a4[2] = a[lda * 7 + 2]; } vx = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); vx1 = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); - vx2 = vec_xxpermdi (vx, vx1, 0); - vx3 = reinterpret_cast(vec_sub (vx2, vmask)); - vsum2 = vec_sum4s (vx3, vsum2); + vx2 = vec_xxpermdi(vx, vx1, 0); + vx3 = AIsSigned ? reinterpret_cast(vx2) : + reinterpret_cast(vec_sub(vx2, vmask)); + vsum2 = vec_sum4s(vx3, vsum2); if (CountK % 16 >= 12) { *reinterpret_cast(&D[64]) = vx3; D += 80; @@ -327,34 +338,38 @@ MlasGemmQuantCopyPackA8x8( Vtype a3 = *reinterpret_cast(&a[lda * 2]); Vtype a4 = *reinterpret_cast(&a[lda * 3]); Vtype vx = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx1 = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); Vtype vx2 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx3 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); - Vtype vx4 = vec_xxpermdi (vx, vx1, 0); - Vtype vx5 = vec_xxpermdi (vx2, vx3, 0); - Vtype vx6 = vec_xxpermdi (vx, vx1, 3); - Vtype vx7 = vec_xxpermdi (vx2, vx3, 3); + Vtype vx4 = vec_xxpermdi(vx, vx1, 0); + Vtype vx5 = vec_xxpermdi(vx2, vx3, 0); + Vtype vx6 = vec_xxpermdi(vx, vx1, 3); + Vtype vx7 = vec_xxpermdi(vx2, vx3, 3); vec_t vx0 = - reinterpret_cast(vec_sub (vx4, vmask)); + AIsSigned ? reinterpret_cast(vx4) : + reinterpret_cast(vec_sub(vx4, vmask)); *reinterpret_cast(&D[0]) = vx0; - vsum = vec_sum4s (vx0, vsum); - vx0 = reinterpret_cast(vec_sub (vx5, vmask)); + vsum = vec_sum4s(vx0, vsum); + vx0 = AIsSigned ? reinterpret_cast(vx5) : + reinterpret_cast(vec_sub(vx5, vmask)); *reinterpret_cast(&D[16]) = vx0; - vsum = vec_sum4s (vx0, vsum); - vx0 = reinterpret_cast(vec_sub (vx6, vmask)); + vsum = vec_sum4s(vx0, vsum); + vx0 = AIsSigned ? reinterpret_cast(vx6) : + reinterpret_cast(vec_sub(vx6, vmask)); *reinterpret_cast(&D[32]) = vx0; - vsum = vec_sum4s (vx0, vsum); - vx0 = reinterpret_cast(vec_sub (vx7, vmask)); + vsum = vec_sum4s(vx0, vsum); + vx0 = AIsSigned ? reinterpret_cast(vx7) : + reinterpret_cast(vec_sub(vx7, vmask)); *reinterpret_cast(&D[48]) = vx0; - vsum = vec_sum4s (vx0, vsum); + vsum = vec_sum4s(vx0, vsum); D += 16 * 4; a += 16; y -= 16; @@ -367,16 +382,17 @@ MlasGemmQuantCopyPackA8x8( int a4 = *reinterpret_cast(&a[lda*3]); __vector int vx1 = { a1, a2, a3, a4}; vec_t vx = - reinterpret_cast(vec_sub (reinterpret_cast(vx1), vmask)); + AIsSigned ? 
reinterpret_cast(vx1) : + reinterpret_cast(vec_sub(reinterpret_cast(vx1), vmask)); *reinterpret_cast(&D[0]) = vx; - vsum = vec_sum4s (vx, vsum); + vsum = vec_sum4s(vx, vsum); D += 16; a += 4; y -= 4; } if (y >= 1) { - Vtype vx = reinterpret_cast(vec_splats(Flip)); + Vtype vx = vmask; vx[0] = a[0]; vx[4] = a[lda]; vx[8] = a[lda * 2]; @@ -394,9 +410,10 @@ MlasGemmQuantCopyPackA8x8( vx[14] = a[lda * 3 + 2]; } vec_t vx1 = - reinterpret_cast(vec_sub (vx, vmask)); + AIsSigned ? reinterpret_cast(vx) : + reinterpret_cast(vec_sub(vx, vmask)); *reinterpret_cast(&D[0]) = vx1; - vsum = vec_sum4s (vx1, vsum); + vsum = vec_sum4s(vx1, vsum); D += 16; a += 16; } @@ -416,9 +433,9 @@ MlasGemmQuantCopyPackA8x8( __vector signed int vsum = { 0 }; while (y >= 16) { - Vtype a4 = reinterpret_cast(vec_splats(Flip)); - Vtype a2 = reinterpret_cast(vec_splats(Flip)); - Vtype a3 = reinterpret_cast(vec_splats(Flip)); + Vtype a4 = vmask; + Vtype a2 = vmask; + Vtype a3 = vmask; Vtype a1 = *reinterpret_cast(&a[0]); if (CountM == 3) { a3 = *reinterpret_cast(&a[lda * 2]); @@ -427,53 +444,58 @@ MlasGemmQuantCopyPackA8x8( a2 = *reinterpret_cast(&a[lda]); } Vtype vx = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx1 = - reinterpret_cast(vec_mergee (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergee(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); Vtype vx2 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a1), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a1), reinterpret_cast<__vector int>(a2))); Vtype vx3 = - reinterpret_cast(vec_mergeo (reinterpret_cast<__vector int>(a3), + reinterpret_cast(vec_mergeo(reinterpret_cast<__vector int>(a3), reinterpret_cast<__vector int>(a4))); - Vtype vx4 = vec_xxpermdi (vx, vx1, 0); - Vtype vx5 = vec_xxpermdi (vx2, vx3, 0); - Vtype vx6 = vec_xxpermdi (vx, vx1, 3); - Vtype vx7 = vec_xxpermdi (vx2, vx3, 3); + Vtype vx4 = vec_xxpermdi(vx, vx1, 0); + Vtype vx5 = vec_xxpermdi(vx2, vx3, 0); + Vtype vx6 = vec_xxpermdi(vx, vx1, 3); + Vtype vx7 = vec_xxpermdi(vx2, vx3, 3); vec_t vx0 = - reinterpret_cast(vec_sub (vx4, vmask)); + AIsSigned ? reinterpret_cast(vx4) : + reinterpret_cast(vec_sub(vx4, vmask)); *reinterpret_cast(&D[0]) = vx0; - vsum = vec_sum4s (vx0, vsum); - vx0 = reinterpret_cast(vec_sub (vx5, vmask)); + vsum = vec_sum4s(vx0, vsum); + vx0 = AIsSigned ? reinterpret_cast(vx5) : + reinterpret_cast(vec_sub(vx5, vmask)); *reinterpret_cast(&D[16]) = vx0; - vsum = vec_sum4s (vx0, vsum); - vx0 = reinterpret_cast(vec_sub (vx6, vmask)); + vsum = vec_sum4s(vx0, vsum); + vx0 = AIsSigned ? reinterpret_cast(vx6) : + reinterpret_cast(vec_sub(vx6, vmask)); *reinterpret_cast(&D[32]) = vx0; - vsum = vec_sum4s (vx0, vsum); - vx0 = reinterpret_cast(vec_sub (vx7, vmask)); + vsum = vec_sum4s(vx0, vsum); + vx0 = AIsSigned ? 
reinterpret_cast(vx7) : + reinterpret_cast(vec_sub(vx7, vmask)); *reinterpret_cast(&D[48]) = vx0; - vsum = vec_sum4s (vx0, vsum); + vsum = vec_sum4s(vx0, vsum); D += 16 * 4; a += 16; y -= 16; } while (y >= 4) { - Vtype vb = reinterpret_cast(vec_splats(Flip)); + Vtype vb = vmask; __vector int vx1 = reinterpret_cast<__vector int>(vb); vx1[0] = *reinterpret_cast(&a[0]); - if(CountM >= 2) { + if (CountM >= 2) { vx1[1] = *reinterpret_cast(&a[lda]); } - if(CountM >= 3) { + if (CountM >= 3) { vx1[2] = *reinterpret_cast(&a[lda*2]); } vec_t vx = - reinterpret_cast(vec_sub (reinterpret_cast(vx1), vmask)); + AIsSigned ? reinterpret_cast(vx1) : + reinterpret_cast(vec_sub(reinterpret_cast(vx1), vmask)); *reinterpret_cast(&D[0]) = vx; - vsum = vec_sum4s (vx, vsum); + vsum = vec_sum4s(vx, vsum); D += 16; a += 4; y -= 4; @@ -508,7 +530,7 @@ MlasGemmQuantCopyPackA8x8( } } *reinterpret_cast(&D[0]) = vx; - vsum = vec_sum4s (vx, vsum); + vsum = vec_sum4s(vx, vsum); D += 16; } *RowSumBuffer++ = vsum[0]; @@ -521,7 +543,7 @@ MlasGemmQuantCopyPackA8x8( } } -template +template void MlasGemmQuantCopyPackB8x8( MLAS_GEMM_QUANT_KERNEL_POWER10::PackedBType* D, @@ -529,29 +551,128 @@ MlasGemmQuantCopyPackB8x8( size_t ldb, size_t CountN, size_t CountK, - int32_t* ColumnSumBuffer, - bool BIsSigned + int32_t* ColumnSumBuffer ) { - const uint8_t BitFlipValue = (BIsSigned ? 0x80 : 0); + [[maybe_unused]] constexpr uint8_t BitFlipValue = (BIsSigned ? 0x80 : 0); typedef __vector unsigned char vec_t; Vtype vmask = reinterpret_cast(vec_splats(BitFlipValue)); vec_t mask = {0,4,8,12,1,5,9,13,2,6,10,14,3,7,11,15}; - const int8_t Flip = (BIsSigned ? -128 : 0); - // Process 4 columns of matrix B in a loop. - // // Copy columns from matrix B to the packed buffer. Signed buffers are // converted to unsigned buffers in order to share a common kernel. // // If CountK is not aligned to a multiple of four, then the packed buffer // is padded with zero vectors. - while (CountN >= 4) { + // Process 16 columns of matrix B in a loop. + // + size_t PackedK = ((CountK + 4 - 1) / 4) * 16; + size_t k2 = PackedK; + size_t k3 = PackedK*2; + size_t k4 = PackedK*3; + + while (CountN >= 16) { const uint8_t* b = B; __vector unsigned int vsum = {0}; + __vector unsigned int vsum2 = {0}; + __vector unsigned int vsum3 = {0}; + __vector unsigned int vsum4 = {0}; size_t y = CountK; - if(y >= 4) { + if (y >= 4) { + do { + Vtype b1 = *reinterpret_cast(&b[0]); + Vtype b2 = *reinterpret_cast(&b[ldb]); + Vtype b3 = *reinterpret_cast(&b[ldb*2]); + Vtype b4 = *reinterpret_cast(&b[ldb*3]); + Vtype t1 = vec_mergeh(b1, b3); + Vtype t2 = vec_mergel(b1, b3); + Vtype t3 = vec_mergeh(b2, b4); + Vtype t4 = vec_mergel(b2, b4); + b1 = vec_mergeh(t1, t3); + b2 = vec_mergel(t1, t3); + b3 = vec_mergeh(t2, t4); + b4 = vec_mergel(t2, t4); + vec_t vx1 = BIsSigned ? reinterpret_cast(vec_add(b1, vmask)) : + reinterpret_cast(b1); + vec_t vx2 = BIsSigned ? reinterpret_cast(vec_add(b2, vmask)) : + reinterpret_cast(b2); + vec_t vx3 = BIsSigned ? reinterpret_cast(vec_add(b3, vmask)) : + reinterpret_cast(b3); + vec_t vx4 = BIsSigned ? 
reinterpret_cast(vec_add(b4, vmask)) : + reinterpret_cast(b4); + *reinterpret_cast(&D[0]) = vx1; + *reinterpret_cast(&D[k2]) = vx2; + *reinterpret_cast(&D[k3]) = vx3; + *reinterpret_cast(&D[k4]) = vx4; + vsum = vec_sum4s(vx1, vsum); + vsum2 = vec_sum4s(vx2, vsum2); + vsum3 = vec_sum4s(vx3, vsum3); + vsum4 = vec_sum4s(vx4, vsum4); + D += 16; + b += ldb*4; + y -= 4; + } while (y >= 4); + } + if (y >= 1) { + Vtype b1 = *reinterpret_cast(&b[0]); + Vtype b2 = (y >= 2) ? *reinterpret_cast(&b[ldb]) : vmask; + Vtype b3 = (y >= 3) ? *reinterpret_cast(&b[ldb*2]) : vmask; + Vtype b4 = vmask; + Vtype t1 = vec_mergeh(b1, b3); + Vtype t2 = vec_mergel(b1, b3); + Vtype t3 = vec_mergeh(b2, b4); + Vtype t4 = vec_mergel(b2, b4); + b1 = vec_mergeh(t1, t3); + b2 = vec_mergel(t1, t3); + b3 = vec_mergeh(t2, t4); + b4 = vec_mergel(t2, t4); + vec_t vx1 = BIsSigned ? reinterpret_cast(vec_add(b1, vmask)) : + reinterpret_cast(b1); + vec_t vx2 = BIsSigned ? reinterpret_cast(vec_add(b2, vmask)) : + reinterpret_cast(b2); + vec_t vx3 = BIsSigned ? reinterpret_cast(vec_add(b3, vmask)) : + reinterpret_cast(b3); + vec_t vx4 = BIsSigned ? reinterpret_cast(vec_add(b4, vmask)) : + reinterpret_cast(b4); + *reinterpret_cast(&D[0]) = vx1; + *reinterpret_cast(&D[k2]) = vx2; + *reinterpret_cast(&D[k3]) = vx3; + *reinterpret_cast(&D[k4]) = vx4; + vsum = vec_sum4s(vx1, vsum); + vsum2 = vec_sum4s(vx2, vsum2); + vsum3 = vec_sum4s(vx3, vsum3); + vsum4 = vec_sum4s(vx4, vsum4); + D += 16; + } + *ColumnSumBuffer++ = vsum[0]; + *ColumnSumBuffer++ = vsum[1]; + *ColumnSumBuffer++ = vsum[2]; + *ColumnSumBuffer++ = vsum[3]; + *ColumnSumBuffer++ = vsum2[0]; + *ColumnSumBuffer++ = vsum2[1]; + *ColumnSumBuffer++ = vsum2[2]; + *ColumnSumBuffer++ = vsum2[3]; + *ColumnSumBuffer++ = vsum3[0]; + *ColumnSumBuffer++ = vsum3[1]; + *ColumnSumBuffer++ = vsum3[2]; + *ColumnSumBuffer++ = vsum3[3]; + *ColumnSumBuffer++ = vsum4[0]; + *ColumnSumBuffer++ = vsum4[1]; + *ColumnSumBuffer++ = vsum4[2]; + *ColumnSumBuffer++ = vsum4[3]; + B += 16; + CountN -= 16; + D += k4; + } + + // Process four columns of matrix B in a loop. + // + while (CountN >= 4) { + const uint8_t* b = B; + __vector unsigned int vsum = {0}; + size_t y = CountK; + if (y >= 4) { do { int b1 = *reinterpret_cast(&b[0]); int b2 = *reinterpret_cast(&b[ldb]); @@ -559,28 +680,30 @@ MlasGemmQuantCopyPackB8x8( int b4 = *reinterpret_cast(&b[ldb*3]); __vector int vb = {b1, b2, b3, b4}; Vtype vx = vec_perm(reinterpret_cast(vb), reinterpret_cast(vb), mask); - vec_t vx1 = reinterpret_cast(vec_add (vx, vmask)); + vec_t vx1 = BIsSigned ? reinterpret_cast(vec_add(vx, vmask)) : + reinterpret_cast(vx); *reinterpret_cast(&D[0]) = vx1; - vsum = vec_sum4s (vx1, vsum); + vsum = vec_sum4s(vx1, vsum); D += 16; b += ldb*4; y -= 4; } while (y >= 4); } if (y >= 1) { - Vtype vb = reinterpret_cast(vec_splats(Flip)); + Vtype vb = vmask; __vector int vb1 = reinterpret_cast<__vector int>(vb); vb1[0] = *reinterpret_cast(&b[0]); - if( y >= 2) { + if (y >= 2) { vb1[1] = *reinterpret_cast(&b[ldb]); } - if( y >= 3) { + if (y >= 3) { vb1[2] = *reinterpret_cast(&b[ldb*2]); } Vtype vx = vec_perm(reinterpret_cast(vb1), reinterpret_cast(vb1), mask); - vec_t vx1 = reinterpret_cast(vec_add (vx, vmask)); + vec_t vx1 = BIsSigned ? 
reinterpret_cast(vec_add(vx, vmask)) : + reinterpret_cast(vx); *reinterpret_cast(&D[0]) = vx1; - vsum = vec_sum4s (vx1, vsum); + vsum = vec_sum4s(vx1, vsum); D += 16; } *ColumnSumBuffer++ = vsum[0]; @@ -600,7 +723,7 @@ MlasGemmQuantCopyPackB8x8( size_t y = CountK; if (y >= 4) { do { - Vtype vb = reinterpret_cast(vec_splats(Flip)); + Vtype vb = vmask; if (CountN == 1) { vb[0] = b[0]; vb[4] = b[ldb]; @@ -632,16 +755,17 @@ MlasGemmQuantCopyPackB8x8( vb[14] = b[ldb*3+2]; } Vtype vx = vec_perm(reinterpret_cast(vb), reinterpret_cast(vb), mask); - vec_t vx1 = reinterpret_cast(vec_add (vx, vmask)); + vec_t vx1 = BIsSigned ? reinterpret_cast(vec_add(vx, vmask)) : + reinterpret_cast(vx); *reinterpret_cast(&D[0]) = vx1; - vsum = vec_sum4s (vx1, vsum); + vsum = vec_sum4s(vx1, vsum); D += 16; b += ldb*4; y -= 4; } while (y >= 4); } if (y >= 1) { - Vtype vb = reinterpret_cast(vec_splats(Flip)); + Vtype vb = vmask; if (CountN == 1) { vb[0]= b[0]; if (y >= 2) { @@ -679,9 +803,10 @@ MlasGemmQuantCopyPackB8x8( } } Vtype vx = vec_perm(reinterpret_cast(vb), reinterpret_cast(vb), mask); - vec_t vx1 = reinterpret_cast(vec_add (vx, vmask)); + vec_t vx1 = BIsSigned ? reinterpret_cast(vec_add(vx, vmask)) : + reinterpret_cast(vx); *reinterpret_cast(&D[0]) = vx1; - vsum = vec_sum4s (vx1, vsum); + vsum = vec_sum4s(vx1, vsum); D += 16; } *ColumnSumBuffer++ = vsum[0]; @@ -707,9 +832,9 @@ MlasGemmQuantCopyPackA( ) { if (AIsSigned) { - MlasGemmQuantCopyPackA8x8<__vector signed char>(D, A, lda, CountM, CountK, RowSumBuffer, AIsSigned); + MlasGemmQuantCopyPackA8x8<__vector signed char, true>(D, A, lda, CountM, CountK, RowSumBuffer); } else { - MlasGemmQuantCopyPackA8x8<__vector unsigned char>(D, A, lda, CountM, CountK, RowSumBuffer, AIsSigned); + MlasGemmQuantCopyPackA8x8<__vector unsigned char, false>(D, A, lda, CountM, CountK, RowSumBuffer); } } template<> @@ -725,9 +850,9 @@ MlasGemmQuantCopyPackB( ) { if (BIsSigned) { - MlasGemmQuantCopyPackB8x8<__vector signed char>(D, B, ldb, CountN, CountK, ColumnSumBuffer, BIsSigned); + MlasGemmQuantCopyPackB8x8<__vector signed char, true>(D, B, ldb, CountN, CountK, ColumnSumBuffer); } else { - MlasGemmQuantCopyPackB8x8< __vector unsigned char>(D, B, ldb, CountN, CountK, ColumnSumBuffer, BIsSigned); + MlasGemmQuantCopyPackB8x8< __vector unsigned char, false>(D, B, ldb, CountN, CountK, ColumnSumBuffer); } } @@ -747,46 +872,93 @@ MlasQgemmStoreVectorMMA int pos ) { - __vector int *rowC; - __vector signed int vsum = {0}; + size_t RowCount; + __vector signed int vsum0, vsum1, vsum2, vsum3; + __vector signed int columnsum = *reinterpret_cast(&ColumnSumBuffer[pos]); + C += VectorCount; if (ZeroPointB != nullptr) { + __vector signed int zeropoint = *reinterpret_cast(&ZeroPointB[pos]); if (ZeroMode) { - for (size_t RowCount = 0;RowCount < row; RowCount++){ - vsum[0] = RowSumBuffer[RowCount] * ZeroPointB[pos] + ColumnSumBuffer[pos]; - vsum[1] = RowSumBuffer[RowCount] * ZeroPointB[pos+1] + ColumnSumBuffer[pos+1]; - vsum[2] = RowSumBuffer[RowCount] * ZeroPointB[pos+2] + ColumnSumBuffer[pos+2]; - vsum[3] = RowSumBuffer[RowCount] * ZeroPointB[pos+3] + ColumnSumBuffer[pos+3]; - rowC = reinterpret_cast<__vector int *>(&C[ldc * RowCount + VectorCount]); - rowC[0] = *reinterpret_cast<__vector int *>(&result[RowCount]) + vsum; + for (RowCount = 0; RowCount + 4 <= row; RowCount += 4, C += ldc*4) { + vsum0 = vec_splats(RowSumBuffer[RowCount + 0]) * zeropoint + columnsum; + vsum1 = vec_splats(RowSumBuffer[RowCount + 1]) * zeropoint + columnsum; + vsum2 = vec_splats(RowSumBuffer[RowCount + 2]) * 
zeropoint + columnsum; + vsum3 = vec_splats(RowSumBuffer[RowCount + 3]) * zeropoint + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; + *reinterpret_cast<__vector int *>(&C[ldc]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 1]) + vsum1; + *reinterpret_cast<__vector int *>(&C[ldc*2]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 2]) + vsum2; + *reinterpret_cast<__vector int *>(&C[ldc*3]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 3]) + vsum3; + } + for (; RowCount < row; RowCount++, C += ldc) { + vsum0 = vec_splats(RowSumBuffer[RowCount]) * zeropoint + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; } } else { - for (size_t RowCount = 0;RowCount < row; RowCount++){ - vsum[0] = RowSumBuffer[RowCount] * ZeroPointB[pos] + ColumnSumBuffer[pos]; - vsum[1] = RowSumBuffer[RowCount] * ZeroPointB[pos+1] + ColumnSumBuffer[pos+1]; - vsum[2] = RowSumBuffer[RowCount] * ZeroPointB[pos+2] + ColumnSumBuffer[pos+2]; - vsum[3] = RowSumBuffer[RowCount] * ZeroPointB[pos+3] + ColumnSumBuffer[pos+3]; - rowC = reinterpret_cast<__vector int *>(&C[ldc * RowCount + VectorCount]); - rowC[0] += *reinterpret_cast<__vector int *>(&result[RowCount]) + vsum; + for (RowCount = 0; RowCount + 4 <= row; RowCount += 4, C += ldc*4) { + vsum0 = vec_splats(RowSumBuffer[RowCount + 0]) * zeropoint + columnsum; + vsum1 = vec_splats(RowSumBuffer[RowCount + 1]) * zeropoint + columnsum; + vsum2 = vec_splats(RowSumBuffer[RowCount + 2]) * zeropoint + columnsum; + vsum3 = vec_splats(RowSumBuffer[RowCount + 3]) * zeropoint + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; + *reinterpret_cast<__vector int *>(&C[ldc]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 1]) + vsum1; + *reinterpret_cast<__vector int *>(&C[ldc*2]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 2]) + vsum2; + *reinterpret_cast<__vector int *>(&C[ldc*3]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 3]) + vsum3; + } + for (; RowCount < row; RowCount++, C += ldc) { + vsum0 = vec_splats(RowSumBuffer[RowCount]) * zeropoint + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; } } } else { if (ZeroMode) { - for (size_t RowCount = 0;RowCount < row; RowCount++){ - vsum[0] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos]; - vsum[1] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos+1]; - vsum[2] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos+2]; - vsum[3] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos+3]; - rowC = reinterpret_cast<__vector int *>(&C[ldc * RowCount + VectorCount]); - rowC[0] = *reinterpret_cast<__vector int *>(&result[RowCount]) + vsum; + for (RowCount = 0; RowCount + 4 <= row; RowCount += 4, C += ldc*4) { + vsum0 = vec_splats(RowSumBuffer[RowCount + 0]) + columnsum; + vsum1 = vec_splats(RowSumBuffer[RowCount + 1]) + columnsum; + vsum2 = vec_splats(RowSumBuffer[RowCount + 2]) + columnsum; + vsum3 = vec_splats(RowSumBuffer[RowCount + 3]) + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; + *reinterpret_cast<__vector int *>(&C[ldc]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 1]) + vsum1; + *reinterpret_cast<__vector int *>(&C[ldc*2]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 2]) + vsum2; + 
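For context on the store routine being rewritten here: the raw int8 accumulator only holds the plain product sum, and the zero-point handling is folded in at store time by adding terms precomputed into RowSumBuffer and ColumnSumBuffer (broadcast with vec_splats across a row of four columns, now four output rows per iteration). A scalar sketch of the underlying identity, with illustrative names and without MLAS's exact sign and offset conventions:

    #include <cstddef>
    #include <cstdint>

    // sum_k (a[k] - za) * (b[k] - zb) expanded so the per-row and per-column pieces
    // can be precomputed once per tile and applied when the accumulator is stored.
    int32_t CorrectedDot(const uint8_t* a, const uint8_t* b, size_t K,
                         int32_t za, int32_t zb) {
      int32_t raw = 0, row_sum = 0, col_sum = 0;
      for (size_t k = 0; k < K; ++k) {
        raw += static_cast<int32_t>(a[k]) * static_cast<int32_t>(b[k]);
        row_sum += a[k];  // analogous to RowSumBuffer[m]
        col_sum += b[k];  // analogous to ColumnSumBuffer[n]
      }
      return raw - zb * row_sum - za * col_sum + static_cast<int32_t>(K) * za * zb;
    }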
*reinterpret_cast<__vector int *>(&C[ldc*3]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 3]) + vsum3; + } + for (; RowCount < row; RowCount++, C += ldc) { + vsum0 = vec_splats(RowSumBuffer[RowCount]) + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) = + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; } } else { - for (size_t RowCount = 0;RowCount < row; RowCount++){ - vsum[0] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos]; - vsum[1] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos+1]; - vsum[2] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos+2]; - vsum[3] = RowSumBuffer[RowCount] + ColumnSumBuffer[pos+3]; - rowC = reinterpret_cast<__vector int *>(&C[ldc * RowCount + VectorCount]); - rowC[0] += *reinterpret_cast<__vector int *>(&result[RowCount]) + vsum; + for (RowCount = 0; RowCount + 4 <= row; RowCount += 4, C += ldc*4) { + vsum0 = vec_splats(RowSumBuffer[RowCount + 0]) + columnsum; + vsum1 = vec_splats(RowSumBuffer[RowCount + 1]) + columnsum; + vsum2 = vec_splats(RowSumBuffer[RowCount + 2]) + columnsum; + vsum3 = vec_splats(RowSumBuffer[RowCount + 3]) + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; + *reinterpret_cast<__vector int *>(&C[ldc]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 1]) + vsum1; + *reinterpret_cast<__vector int *>(&C[ldc*2]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 2]) + vsum2; + *reinterpret_cast<__vector int *>(&C[ldc*3]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 3]) + vsum3; + } + for (; RowCount < row; RowCount++, C += ldc) { + vsum0 = vec_splats(RowSumBuffer[RowCount]) + columnsum; + *reinterpret_cast<__vector int *>(&C[0]) += + *reinterpret_cast<__vector int *>(&result[RowCount + 0]) + vsum0; } } } @@ -846,36 +1018,36 @@ MlasQgemmComputeMMA( ) { if (CountK == 16) { - __builtin_mma_xvi8ger4pp (acc0, va[0], vb[0]); - __builtin_mma_xvi8ger4pp (acc0, va[1], vb[1]); - __builtin_mma_xvi8ger4pp (acc0, va[2], vb[2]); - __builtin_mma_xvi8ger4pp (acc0, va[3], vb[3]); + __builtin_mma_xvi8ger4pp(acc0, va[0], vb[0]); + __builtin_mma_xvi8ger4pp(acc0, va[1], vb[1]); + __builtin_mma_xvi8ger4pp(acc0, va[2], vb[2]); + __builtin_mma_xvi8ger4pp(acc0, va[3], vb[3]); if (CountM) { - __builtin_mma_xvi8ger4pp (acc1, va[4], vb[0]); - __builtin_mma_xvi8ger4pp (acc1, va[5], vb[1]); - __builtin_mma_xvi8ger4pp (acc1, va[6], vb[2]); - __builtin_mma_xvi8ger4pp (acc1, va[7], vb[3]); + __builtin_mma_xvi8ger4pp(acc1, va[4], vb[0]); + __builtin_mma_xvi8ger4pp(acc1, va[5], vb[1]); + __builtin_mma_xvi8ger4pp(acc1, va[6], vb[2]); + __builtin_mma_xvi8ger4pp(acc1, va[7], vb[3]); } } else if (CountK == 12) { - __builtin_mma_xvi8ger4pp (acc0, va[0], vb[0]); - __builtin_mma_xvi8ger4pp (acc0, va[1], vb[1]); - __builtin_mma_xvi8ger4pp (acc0, va[2], vb[2]); + __builtin_mma_xvi8ger4pp(acc0, va[0], vb[0]); + __builtin_mma_xvi8ger4pp(acc0, va[1], vb[1]); + __builtin_mma_xvi8ger4pp(acc0, va[2], vb[2]); if (CountM) { - __builtin_mma_xvi8ger4pp (acc1, va[3], vb[0]); - __builtin_mma_xvi8ger4pp (acc1, va[4], vb[1]); - __builtin_mma_xvi8ger4pp (acc1, va[5], vb[2]); + __builtin_mma_xvi8ger4pp(acc1, va[3], vb[0]); + __builtin_mma_xvi8ger4pp(acc1, va[4], vb[1]); + __builtin_mma_xvi8ger4pp(acc1, va[5], vb[2]); } } else if (CountK == 8) { - __builtin_mma_xvi8ger4pp (acc0, va[0], vb[0]); - __builtin_mma_xvi8ger4pp (acc0, va[1], vb[1]); + __builtin_mma_xvi8ger4pp(acc0, va[0], vb[0]); + __builtin_mma_xvi8ger4pp(acc0, va[1], vb[1]); if (CountM) { - 
__builtin_mma_xvi8ger4pp (acc1, va[2], vb[0]); - __builtin_mma_xvi8ger4pp (acc1, va[3], vb[1]); + __builtin_mma_xvi8ger4pp(acc1, va[2], vb[0]); + __builtin_mma_xvi8ger4pp(acc1, va[3], vb[1]); } } else { - __builtin_mma_xvi8ger4pp (acc0, va[0], vb[0]); + __builtin_mma_xvi8ger4pp(acc0, va[0], vb[0]); if (CountM) { - __builtin_mma_xvi8ger4pp (acc1, va[1], vb[0]); + __builtin_mma_xvi8ger4pp(acc1, va[1], vb[0]); } } }; @@ -902,7 +1074,7 @@ MlasGemmQuantKernel( if (Mval >= 8) { Mval = 4; } - while(CountN > 0) { + while (CountN > 0) { const int8_t *a = A; typedef __vector unsigned char vec_t; const uint8_t *b = B; @@ -1057,23 +1229,23 @@ MlasGemmQuantKernel( } // Store matrix C with accumulator result. if (CountN >=16) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc0); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc0); MlasQgemmStoreVectorMMA<0>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 0); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc1); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc1); MlasQgemmStoreVectorMMA<4>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 4); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc2); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc2); MlasQgemmStoreVectorMMA<8>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 8); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc3); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc3); MlasQgemmStoreVectorMMA<12>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 12); if (CountM >= 8) { C1 = C+ldc*4; - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc4); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc4); MlasQgemmStoreVectorMMA<0>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 0); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc5); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc5); MlasQgemmStoreVectorMMA<4>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 4); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc6); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc6); MlasQgemmStoreVectorMMA<8>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 8); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc7); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc7); MlasQgemmStoreVectorMMA<12>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 12); } INC_BUFFER(16); @@ -1082,72 +1254,72 @@ MlasGemmQuantKernel( C += 16; } else { if (CountN >=12 ) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc0); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc0); MlasQgemmStoreVectorMMA<0>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 0); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc1); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc1); MlasQgemmStoreVectorMMA<4>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 4); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc2); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc2); MlasQgemmStoreVectorMMA<8>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 8); if (CountM >= 8) { C1 = C+ldc*4; - 
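Each __builtin_mma_xvi8ger4pp call above performs one rank-4 int8 outer-product accumulate into a 4x4 int32 accumulator tile, so the CountK branches simply issue one call per 4-element slice of K (16, 12, 8 or 4 packed values). A simplified scalar model of a single such update, glossing over the exact lane ordering and operand signedness defined by the ISA:

    #include <cstdint>

    // One rank-4 update: acc[4][4] += a(4x4 int8) * b(4x4 int8); each call advances
    // the quantized GEMM by four values of k for one 4x4 output tile.
    void Rank4UpdateModel(int32_t acc[4][4], const int8_t a[4][4], const int8_t b[4][4]) {
      for (int m = 0; m < 4; ++m) {
        for (int n = 0; n < 4; ++n) {
          for (int k = 0; k < 4; ++k) {
            acc[m][n] += static_cast<int32_t>(a[m][k]) * static_cast<int32_t>(b[k][n]);
          }
        }
      }
    }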
__builtin_mma_disassemble_acc (reinterpret_cast(result), &acc4); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc4); MlasQgemmStoreVectorMMA<0>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 0); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc5); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc5); MlasQgemmStoreVectorMMA<4>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 4); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc6); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc6); MlasQgemmStoreVectorMMA<8>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 8); } INC_BUFFER(12); if (CountN - 12 > 0) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc3); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc3); if (CountM >= 8) { - __builtin_mma_disassemble_acc (reinterpret_cast(result1), &acc7); + __builtin_mma_disassemble_acc(reinterpret_cast(result1), &acc7); } } CountN -= 12; C += 12; } else if (CountN >= 8) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc0); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc0); MlasQgemmStoreVectorMMA<0>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 0); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc1); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc1); MlasQgemmStoreVectorMMA<4>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 4); if (CountM >= 8) { C1 = C+ldc*4; - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc4); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc4); MlasQgemmStoreVectorMMA<0>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 0); - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc5); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc5); MlasQgemmStoreVectorMMA<4>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 4); } INC_BUFFER(8); if (CountN - 8 > 0) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc2); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc2); if (CountM >= 8) { - __builtin_mma_disassemble_acc (reinterpret_cast(result1), &acc6); + __builtin_mma_disassemble_acc(reinterpret_cast(result1), &acc6); } } CountN -= 8; C += 8; } else if (CountN >= 4) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc0); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc0); MlasQgemmStoreVectorMMA<0>(result, C, ldc, Mval, ZeroMode, RowSumBuffer, ColumnSumBuffer, ZeroPointB, 0); if (CountM >= 8) { C1 = C+ldc*4; - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc4); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc4); MlasQgemmStoreVectorMMA<0>(result, C1, ldc, 4, ZeroMode, RowSumBuffer+4, ColumnSumBuffer, ZeroPointB, 0); if (CountN - 4 > 0) { - __builtin_mma_disassemble_acc (reinterpret_cast(result1), &acc5); + __builtin_mma_disassemble_acc(reinterpret_cast(result1), &acc5); } } INC_BUFFER(4); if (CountN - 4 > 0) { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc1); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc1); } CountN -= 4; C += 4; } else { - __builtin_mma_disassemble_acc (reinterpret_cast(result), &acc0); + __builtin_mma_disassemble_acc(reinterpret_cast(result), &acc0); if (CountM >= 8) { - __builtin_mma_disassemble_acc (reinterpret_cast(result1), 
&acc4); + __builtin_mma_disassemble_acc(reinterpret_cast(result1), &acc4); } } CountN &= 3; diff --git a/onnxruntime/core/optimizer/gather_fusion.cc b/onnxruntime/core/optimizer/gather_fusion.cc index 1f2b31526c6b8..2bde320786130 100644 --- a/onnxruntime/core/optimizer/gather_fusion.cc +++ b/onnxruntime/core/optimizer/gather_fusion.cc @@ -268,7 +268,7 @@ Status GatherSliceToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int gra } ONNX_NAMESPACE::TensorProto split_initializer_proto; - split_initializer_proto.set_name(graph.GenerateNodeName("splits")); + split_initializer_proto.set_name(graph.GenerateNodeArgName("splits")); split_initializer_proto.set_data_type(ONNX_NAMESPACE::TensorProto_DataType_INT64); split_initializer_proto.add_dims(static_cast(split_values.size())); split_initializer_proto.mutable_int64_data()->Add(split_values.begin(), split_values.end()); diff --git a/onnxruntime/core/providers/cpu/quantization/quantize_linear.cc b/onnxruntime/core/providers/cpu/quantization/quantize_linear.cc index 05dea2a05c97b..91e21b3690b27 100644 --- a/onnxruntime/core/providers/cpu/quantization/quantize_linear.cc +++ b/onnxruntime/core/providers/cpu/quantization/quantize_linear.cc @@ -25,10 +25,7 @@ class DequantizeLinear final : public OpKernel { block_size_ = 0; } - // TODO(adrianlizarraga): Support the block_size attribute added in opset 21. - if (block_size_ != 0) { - ORT_THROW("DequantizeLinear does not yet support the 'block_size' attribute."); - } + ORT_ENFORCE(block_size_ >= 0, "'block_size' must be non-negative."); } Status Compute(OpKernelContext* context) const override; @@ -71,31 +68,55 @@ static void PrepareForQDQ(const TensorShape& input_shape, const Tensor& scale, const Tensor* zero_point_ptr, int64_t axis, - int64_t& quant_block_count, // A "quant block" is a block of elems with the same scale/zp - int64_t& axis_dim_val, - int64_t& quant_block_size) { + int64_t quant_block_size, + int64_t& process_block_count, + int64_t& broadcast_dim, + int64_t& process_block_size) { if (IsScalarOr1ElementVector(&scale)) { // per-tensor QuantizeLinear/DequantizeLinear - quant_block_count = 1; - axis_dim_val = 1; - quant_block_size = static_cast(input_shape.Size()); + process_block_count = 1; + broadcast_dim = 1; + process_block_size = static_cast(input_shape.Size()); // enforce that zero point are scalars ORT_ENFORCE(zero_point_ptr == nullptr || IsScalarOr1ElementVector(zero_point_ptr), "x_zero_point must be null or a scalar or 1D tensor or size 1."); - } else { // per-channel QuantizeLinear/DequantizeLinear + ORT_ENFORCE(quant_block_size == 0, "block_size must be 0 for per-tensor quantization."); + } else { // per-axis or blocked QuantizeLinear/DequantizeLinear const int64_t axis_no_neg = HandleNegativeAxis(axis, input_shape.NumDimensions()); - quant_block_count = input_shape.SizeToDimension(onnxruntime::narrow(axis_no_neg)); - axis_dim_val = input_shape[onnxruntime::narrow(axis_no_neg)]; - quant_block_size = input_shape.SizeFromDimension(SafeInt(axis_no_neg) + 1); + process_block_count = input_shape.SizeToDimension(onnxruntime::narrow(axis_no_neg)); + broadcast_dim = input_shape[onnxruntime::narrow(axis_no_neg)]; + process_block_size = input_shape.SizeFromDimension(SafeInt(axis_no_neg) + 1); // if an axis was specified, ensure the scale and zero point are compatible - ORT_ENFORCE(scale.Shape().NumDimensions() == 1 && scale.Shape()[0] == axis_dim_val, - "scale must be 1D tensor with size ", - axis_dim_val); - ORT_ENFORCE(zero_point_ptr == nullptr || - 
(zero_point_ptr->Shape().NumDimensions() == 1 && zero_point_ptr->Shape()[0] == axis_dim_val), - "x_zero_point must be null or 1D tensor with size ", - axis_dim_val); + if (quant_block_size) { // blocked quantization + ORT_ENFORCE(scale.Shape().NumDimensions() == input_shape.NumDimensions(), + "x_scale and x must have the same rank for blocked quantization"); + ORT_ENFORCE(zero_point_ptr == nullptr || zero_point_ptr->Shape().NumDimensions() == input_shape.NumDimensions(), + "x_zero_point must be null or have the same rank as x for blocked quantization"); + + for (size_t i = 0, ndim = input_shape.NumDimensions(); i < ndim; ++i) { + if (i == SafeInt(axis_no_neg)) { + ORT_ENFORCE(scale.Shape()[i] == (input_shape[i] + quant_block_size - 1) / quant_block_size, + "x_scale must be ceil(Di/block_size) on the quantize axis i for blocked quantization"); + } else { + ORT_ENFORCE(scale.Shape()[i] == input_shape[i], + "x_scale and x must have the same shape despite the quantize axis for blocked quantization"); + } + + if (zero_point_ptr) { + ORT_ENFORCE(zero_point_ptr->Shape()[i] == scale.Shape()[i], + "x_zero_point and x_scale must have the same shape for blocked quantization"); + } + } + } else { // per-axis quantization + ORT_ENFORCE(scale.Shape().NumDimensions() == 1 && scale.Shape()[0] == broadcast_dim, + "For per axis quantization, scale must be 1D tensor with size ", + broadcast_dim); + ORT_ENFORCE(zero_point_ptr == nullptr || (zero_point_ptr->Shape().NumDimensions() == 1 && + zero_point_ptr->Shape()[0] == broadcast_dim), + "For per axis quantization, x_zero_point must be null or 1D tensor with size ", + broadcast_dim); + } } } @@ -244,66 +265,198 @@ ONNX_CPU_OPERATOR_TYPED_MS_KERNEL( } // namespace contrib #endif // !defined(DISABLE_CONTRIB_OPS) +template +struct DequantizeLinearApply; + +// The dimensions before quantize axis and after quantize axis can be flattened. +// After flattening, the tensor can be represented by a rank-3 tensor. +// If the quantization happens on the first or last axis, the flattened tensor is +// effectively rank-2. +// For per tensor quantization, the tensor is effectively rank-1. template -struct DequantizeLinearApply { - void op(int64_t N, int64_t axis_dim_val, int64_t quant_block_size, const T* input, const OutT* scale, OutT* output, - const T* zero_point) { - for (size_t n = 0; n < static_cast(N); n++) { - for (size_t bd = 0; bd < static_cast(axis_dim_val); bd++) { - auto zp = zero_point ? static_cast(zero_point[bd]) : 0; - auto sc = static_cast(scale[bd]); - for (size_t bs = 0; bs < static_cast(quant_block_size); bs++) { +struct DequantizeLinearApply { + /** + * @brief Calculate per-tensor/layer or per-axis quantization of DequantizeLinear on the + * flattened tensors. + * @param[in] M size of dimensions before the quantize axis + * @param[in] K dimension on the quantize axis + * @param[in] N size of dimensions after the quantize axis + * @param[in] input 1D array of flattened [D0, ..., Di, ..., Dn] + * @param[in] scale scalar for per-tensor/layer quantization and 1D array [Di] + * for per-axis quantization. i is the quantize axis. + * @param[out] output same shape as input + * @param[in] zero_point same shape as scale + */ + void op(size_t M, size_t K, size_t N, const T* input, + const OutT* scale, OutT* output, const T* zero_point) { + for (size_t m = 0; m < M; m++) { + for (size_t k = 0; k < K; k++) { + auto zp = zero_point ? 
static_cast(zero_point[k]) : 0; + auto sc = static_cast(scale[k]); + for (size_t n = 0; n < N; n++) { *output++ = static_cast(static_cast(static_cast(*input++) - zp) * sc); } } } } + + /** + * @brief Calculate blocked quantization of DequantizeLinear on the flattened tensors. + * TODO(fajin): add mlas kernel to utilize multithreading, refer MlasDequantizeBlockwise. + * @param[in] M size of dimensions before the quantize axis + * @param[in] K dimension of the quantize axis + * @param[in] N size of dimensions after the quantize axis + * @param[in] quant_block_size quantize block size along the quantize axis + * @param[in] input 1D array of flattened [D0, ..., Di, ..., Dn] + * @param[in] scale 1D array of flattened [D0, ..., ceil(Di/quant_block_size), ..., Dn]. + * i is the quantize axis. + * @param[out] output same shape as input + * @param[in] zero_point same shape as scale + */ + void op(size_t M, size_t K, size_t N, size_t quant_block_size, + const T* input, const OutT* scale, OutT* output, const T* zero_point) { + if (zero_point) { + for (size_t m = 0; m < M; m++) { + for (size_t bd = 0; bd < K; bd += quant_block_size) { + for (size_t qb = 0, qb_end = std::min(quant_block_size, K - bd); qb < qb_end; ++qb) { + // within the quantize block, the zero point and scale are the same. + for (size_t bs = 0; bs < N; bs++) { + auto zp = static_cast(zero_point[bs]); + auto sc = static_cast(scale[bs]); + *output++ = static_cast(static_cast(static_cast(*input++) - zp) * sc); + } + } + + // move to the next quantize block + zero_point += N; + scale += N; + } + } + } else { + for (size_t m = 0; m < M; m++) { + for (size_t bd = 0; bd < K; bd += quant_block_size) { + for (size_t qb = 0, qb_end = std::min(quant_block_size, K - bd); qb < qb_end; ++qb) { + // within the quantize block, the zero point and scale are the same. + for (size_t bs = 0; bs < N; bs++) { + auto sc = static_cast(scale[bs]); + *output++ = static_cast(static_cast(static_cast(*input++)) * sc); + } + } + + // move to the next quantize block + scale += N; + } + } + } + } }; -#define DEQUANTIZE_LINEAR_APPLY_INT4(T) \ - template \ - struct DequantizeLinearApply { \ - void op(int64_t N, int64_t axis_dim_val, int64_t quant_block_size, const T* input, const OutT* scale, \ - OutT* output, const T* zero_point) { \ - size_t input_index = 0; \ - for (size_t n = 0; n < static_cast(N); n++) { \ - for (size_t bd = 0; bd < static_cast(axis_dim_val); bd++) { \ - size_t bd_i = bd >> 1; /*bd / 2*/ \ - size_t bd_j = bd & 0x1; /*bd % 2*/ \ - auto zp = zero_point ? static_cast(zero_point[bd_i].GetElem(bd_j)) : 0; \ - auto sc = static_cast(scale[bd]); \ - for (size_t bs = 0; bs < static_cast(quant_block_size); bs++) { \ - size_t input_i = input_index >> 1; \ - size_t input_j = input_index & 0x1; \ - int32_t val = static_cast(input[input_i].GetElem(input_j)); \ - *output++ = static_cast(static_cast(val - zp) * sc); \ - input_index += 1; \ - } \ - } \ - } \ - assert(input_index == static_cast(N * axis_dim_val * quant_block_size)); \ - } \ - }; +template +struct DequantizeLinearApply { + // per-tensor/layer or per-axis quantization + void op(size_t M, size_t K, size_t N, + const T* input, const OutT* scale, OutT* output, const T* zero_point) { + size_t input_index = 0; + + for (size_t m = 0; m < M; m++) { + for (size_t bd = 0; bd < K; bd++) { + size_t bd_i = bd >> 1; /*bd / 2*/ + size_t bd_j = bd & 0x1; /*bd % 2*/ + auto zp = zero_point ? 
static_cast(zero_point[bd_i].GetElem(bd_j)) : 0; + auto sc = static_cast(scale[bd]); + + for (size_t bs = 0; bs < N; bs++) { + size_t input_i = input_index >> 1; + size_t input_j = input_index & 0x1; + int32_t val = static_cast(input[input_i].GetElem(input_j)); + *output++ = static_cast(static_cast(val - zp) * sc); + input_index += 1; + } + } + } + + assert(input_index == M * K * N); + } + + // Blocked quantization + // TODO(fajin) : add mlas kernel to utilize multithreading, refer MlasDequantizeBlockwise. + void op(size_t M, size_t K, size_t N, size_t quant_block_size, + const T* input, const OutT* scale, OutT* output, const T* zero_point) { + size_t input_index = 0; + + if (zero_point) { + size_t zp_index = 0; + + for (size_t n = 0; n < M; n++) { + for (size_t bd = 0; bd < K; bd += quant_block_size) { + for (size_t qb = 0, qb_end = std::min(quant_block_size, K - bd); qb < qb_end; ++qb) { + auto q_zp_index = zp_index; + for (size_t bs = 0; bs < N; ++bs, ++input_index, ++q_zp_index) { + auto zp = static_cast(zero_point[q_zp_index >> 1].GetElem(q_zp_index & 0x1)); + auto sc = static_cast(scale[bs]); + + int32_t val = static_cast(input[input_index >> 1].GetElem(input_index & 0x1)); + *output++ = static_cast(static_cast(val - zp) * sc); + } + } + + scale += N; + zp_index += N; + } + } + } else { + for (size_t n = 0; n < M; n++) { + for (size_t bd = 0; bd < K; bd += quant_block_size) { + for (size_t qb = 0, qb_end = std::min(quant_block_size, K - bd); qb < qb_end; ++qb) { + for (size_t bs = 0; bs < N; ++bs, ++input_index) { + auto sc = static_cast(scale[bs]); + + int32_t val = static_cast(input[input_index >> 1].GetElem(input_index & 0x1)); + *output++ = static_cast(static_cast(val) * sc); + } + } + + scale += N; + } + } + } -DEQUANTIZE_LINEAR_APPLY_INT4(Int4x2); -DEQUANTIZE_LINEAR_APPLY_INT4(UInt4x2); + assert(input_index == M * K * N); + } +}; #if !defined(DISABLE_FLOAT8_TYPES) -#define DEQUANTIZE_LINEAR_APPLY_FLOAT8(T) \ - template \ - struct DequantizeLinearApply { \ - void op(int64_t N, int64_t axis_dim_val, int64_t quant_block_size, const T* input, const OutT* scale, \ - OutT* output, const T*) { \ - for (size_t n = 0; n < static_cast(N); n++) { \ - for (size_t bd = 0; bd < static_cast(axis_dim_val); bd++) { \ - auto sc = scale[bd]; \ - for (size_t bs = 0; bs < static_cast(quant_block_size); bs++, input++) { \ - *output++ = static_cast(input->ToFloat() * sc); \ - } \ - } \ - } \ - } \ +#define DEQUANTIZE_LINEAR_APPLY_FLOAT8(T) \ + template \ + struct DequantizeLinearApply { \ + /* Per-tensor/layer or per-axis quantization */ \ + void op(size_t M, size_t K, size_t N, \ + const T* input, const OutT* scale, OutT* output, const T*) { \ + for (size_t m = 0; m < M; m++) { \ + for (size_t bd = 0; bd < K; bd++) { \ + auto sc = scale[bd]; \ + for (size_t bs = 0; bs < N; bs++, input++) { \ + *output++ = static_cast(input->ToFloat() * sc); \ + } \ + } \ + } \ + } \ + /* Blocked quantization */ \ + void op(size_t M, size_t K, size_t N, size_t quant_block_size, \ + const T* input, const OutT* scale, OutT* output, const T*) { \ + for (size_t m = 0; m < M; m++) { \ + for (size_t bd = 0; bd < K; bd += quant_block_size) { \ + for (size_t qb = 0, qb_end = std::min(quant_block_size, K - bd); qb < qb_end; ++qb) { \ + for (size_t bs = 0; bs < N; bs++, input++) { \ + auto sc = static_cast(scale[bs]); \ + *output++ = static_cast(input->ToFloat() * sc); \ + } \ + } \ + scale += N; \ + } \ + } \ + } \ }; DEQUANTIZE_LINEAR_APPLY_FLOAT8(Float8E4M3FN) @@ -323,11 +476,12 @@ Status 
DequantizeLinear::Compute(OpKernelContext* ctx) const { const auto& x_shape = x.Shape(); auto& y = *ctx->Output(0, x_shape); - int64_t N; - int64_t axis_dim_val; - int64_t quant_block_size; + int64_t process_block_count; + int64_t broadcast_dim; + int64_t process_block_size; - PrepareForQDQ(x.Shape(), x_scale, x_zero_point, axis_, N, axis_dim_val, quant_block_size); + PrepareForQDQ(x.Shape(), x_scale, x_zero_point, axis_, block_size_, + process_block_count, broadcast_dim, process_block_size); const T* zero_point = x_zero_point ? x_zero_point->Data() : nullptr; @@ -345,15 +499,38 @@ Status DequantizeLinear::Compute(OpKernelContext* ctx) const { const auto to = x_scale.GetElementType(); const T* input = x.Data(); + constexpr bool is_4bit = boost::mp11::mp_contains, T>::value; if (to == ONNX_NAMESPACE::TensorProto::FLOAT) { const float* scale = x_scale.Data(); float* output = y.MutableData(); - DequantizeLinearApply().op(N, axis_dim_val, quant_block_size, input, scale, output, zero_point); + if (block_size_) { + DequantizeLinearApply().op(static_cast(process_block_count), + static_cast(broadcast_dim), + static_cast(process_block_size), + static_cast(block_size_), + input, scale, output, zero_point); + } else { + DequantizeLinearApply().op(static_cast(process_block_count), + static_cast(broadcast_dim), + static_cast(process_block_size), + input, scale, output, zero_point); + } } else if (to == ONNX_NAMESPACE::TensorProto::FLOAT16) { const MLFloat16* scale = x_scale.Data(); MLFloat16* output = y.MutableData(); - DequantizeLinearApply().op(N, axis_dim_val, quant_block_size, input, scale, output, zero_point); + if (block_size_) { + DequantizeLinearApply().op(static_cast(process_block_count), + static_cast(broadcast_dim), + static_cast(process_block_size), + static_cast(block_size_), + input, scale, output, zero_point); + } else { + DequantizeLinearApply().op(static_cast(process_block_count), + static_cast(broadcast_dim), + static_cast(process_block_size), + input, scale, output, zero_point); + } } else if (to == ONNX_NAMESPACE::TensorProto::BFLOAT16) { ORT_THROW("DequantizeLinear into BFLOAT16 is not implemented yet."); } else { @@ -524,14 +701,14 @@ void ParQuantizeLinear(const InputType* Input, } template -void ComputeLoop(OpKernelContext* ctx, const InT* input, const InT* scale, const T* zero_point, T* output, int64_t N, - int64_t axis_dim_val, int64_t quant_block_size, bool saturate) { - for (size_t n = 0; n < static_cast(N); n++) { - for (size_t bd = 0; bd < static_cast(axis_dim_val); bd++) { - ParQuantizeLinear(input, output, static_cast(quant_block_size), scale[bd], bd, zero_point, saturate, - ctx->GetOperatorThreadPool()); - input += quant_block_size; - output += quant_block_size; +void ComputeLoop(OpKernelContext* ctx, const InT* input, const InT* scale, const T* zero_point, T* output, + int64_t process_block_count, int64_t broadcast_dim, int64_t process_block_size, bool saturate) { + for (size_t n = 0; n < static_cast(process_block_count); n++) { + for (size_t bd = 0; bd < static_cast(broadcast_dim); bd++) { + ParQuantizeLinear(input, output, static_cast(process_block_size), scale[bd], bd, zero_point, + saturate, ctx->GetOperatorThreadPool()); + input += process_block_size; + output += process_block_size; } } } @@ -611,20 +788,21 @@ Status QuantizeLinear::Compute(OpKernelContext* ctx) const { const auto& x_shape = x.Shape(); auto& y = *ctx->Output(0, x_shape); - int64_t N; - int64_t axis_dim_val; - int64_t quant_block_size; - PrepareForQDQ(x.Shape(), y_scale, y_zero_point, axis_, N, 
axis_dim_val, quant_block_size); + int64_t process_block_count; + int64_t broadcast_dim; + int64_t process_block_size; + PrepareForQDQ(x.Shape(), y_scale, y_zero_point, axis_, block_size_, + process_block_count, broadcast_dim, process_block_size); const T* zero_point = y_zero_point != nullptr ? y_zero_point->Data() : nullptr; T* output = y.MutableData(); if (x.IsDataType()) { - ComputeLoop(ctx, x.Data(), y_scale.Data(), zero_point, output, N, axis_dim_val, - quant_block_size, saturate_); + ComputeLoop(ctx, x.Data(), y_scale.Data(), zero_point, output, + process_block_count, broadcast_dim, process_block_size, saturate_); } else if (x.IsDataType()) { - ComputeLoop(ctx, x.Data(), y_scale.Data(), zero_point, output, N, - axis_dim_val, quant_block_size, saturate_); + ComputeLoop(ctx, x.Data(), y_scale.Data(), zero_point, output, + process_block_count, broadcast_dim, process_block_size, saturate_); } else { ORT_THROW("Unsupported input type."); } diff --git a/onnxruntime/core/providers/cpu/tensor/trilu.cc b/onnxruntime/core/providers/cpu/tensor/trilu.cc index 91e429ef60d91..017bbcd44904e 100644 --- a/onnxruntime/core/providers/cpu/tensor/trilu.cc +++ b/onnxruntime/core/providers/cpu/tensor/trilu.cc @@ -31,7 +31,7 @@ ONNX_OPERATOR_KERNEL_EX( kOnnxDomain, 14, kCpuExecutionProvider, - KernelDefBuilder().MayInplace(0, 0).TypeConstraint("T", BuildKernelDefConstraints()), + KernelDefBuilder().MayInplace(0, 0).TypeConstraint("T", BuildKernelDefConstraints()), Trilu); template @@ -110,6 +110,9 @@ Status Trilu::Compute(OpKernelContext* ctx) const { case sizeof(double): status = TriluImpl(X, Y, k_val, up); break; + case sizeof(bool): + status = TriluImpl(X, Y, k_val, up); + break; default: ORT_THROW("Unsupported input data type of ", data_type); } diff --git a/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh b/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh index a41888d0df48b..1469f55f0bfda 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/binary_elementwise_impl.cuh @@ -2,6 +2,7 @@ // Licensed under the MIT License. #pragma once +#include #include #include "core/providers/cuda/shared_inc/cuda_utils.h" #include "core/providers/cuda/cu_inc/common.cuh" @@ -11,7 +12,8 @@ namespace cuda { // broadcast by computing output coordinate from offset, using fast_divmod template + bool lhs_need_compute, bool rhs_need_compute, int NumThreadsPerBlock, int NumElementsPerThread, + typename NumElemT> __global__ void _BinaryElementWise( int32_t output_rank, const TArray lhs_padded_strides, @@ -21,19 +23,19 @@ __global__ void _BinaryElementWise( const TArray fdm_output_strides, T* output_data, const FuncT& functor, - CUDA_LONG N) { - CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + NumElemT N) { + NumElemT start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; T1 lvalue[NumElementsPerThread]; T2 rvalue[NumElementsPerThread]; - CUDA_LONG id = start; + NumElemT id = start; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { if (id < N) { - CUDA_LONG lhs_index = (lhs_need_compute ? 0 : id); - CUDA_LONG rhs_index = (rhs_need_compute ? 0 : id); + NumElemT lhs_index = (lhs_need_compute ? 0 : id); + NumElemT rhs_index = (rhs_need_compute ? 
0 : id); // compute indexes with broadcasting rules: https://github.com/onnx/onnx/blob/main/docs/Broadcasting.md - CUDA_LONG offset = id; + NumElemT offset = id; #pragma unroll for (auto dim = 0; dim < fdm_output_strides.Capacity(); dim++) { if (dim >= output_rank) { @@ -69,18 +71,19 @@ __global__ void _BinaryElementWise( } // for scalar broadcast or non-broadcast case -template +template __global__ void _BinaryElementWiseSimple( const T1* lhs_data, const T2* rhs_data, T* output_data, const FuncT func, - CUDA_LONG N) { - CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + NumElemT N) { + NumElemT start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; T1 lvalue[NumElementsPerThread]; T2 rvalue[NumElementsPerThread]; - CUDA_LONG id = start; + NumElemT id = start; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { if (id < N) { @@ -103,23 +106,24 @@ __global__ void _BinaryElementWiseSimple( } // for rhs per-channel broadcast case -template +template __global__ void _BinaryElementWiseRhsPerChannelBatch1( const T1* lhs_data, const T2* rhs_data, const fast_divmod fdm_H, T* output_data, FuncT func, - CUDA_LONG N) { - CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + NumElemT N) { + NumElemT start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; T1 lvalue[NumElementsPerThread]; T2 rvalue[NumElementsPerThread]; - CUDA_LONG id = start; + NumElemT id = start; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { if (id < N) { - CUDA_LONG rhs_id = fdm_H.div(id); + NumElemT rhs_id = fdm_H.div(id); lvalue[i] = lhs_data[id]; rvalue[i] = rhs_data[rhs_id]; @@ -138,7 +142,8 @@ __global__ void _BinaryElementWiseRhsPerChannelBatch1( } } -template +template __global__ void _BinaryElementWiseRhsPerChannelBatchN( const T1* lhs_data, const T2* rhs_data, @@ -146,16 +151,16 @@ __global__ void _BinaryElementWiseRhsPerChannelBatchN( const fast_divmod fdm_C, T* output_data, FuncT func, - CUDA_LONG N) { - CUDA_LONG start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; + NumElemT N) { + NumElemT start = NumElementsPerThread * NumThreadsPerBlock * blockIdx.x + threadIdx.x; T1 lvalue[NumElementsPerThread]; T2 rvalue[NumElementsPerThread]; - CUDA_LONG id = start; + NumElemT id = start; #pragma unroll for (int i = 0; i < NumElementsPerThread; i++) { if (id < N) { - CUDA_LONG rhs_id = fdm_H.div(id); + NumElemT rhs_id = fdm_H.div(id); int q, r; fdm_C.divmod(rhs_id, q, r); rhs_id = r; @@ -189,27 +194,34 @@ void BinaryElementWiseNoBroadcastImpl( if (count == 0) // special case where there's a dim value of 0 in the output shape return; - #ifdef USE_ROCM +#ifdef USE_ROCM const int num_elements_per_thread = 2; const int num_threads_per_block = 512; - #else +#else const int num_elements_per_thread = GridDim::maxElementsPerThread; const int num_threads_per_block = GridDim::maxThreadsPerBlock; - #endif +#endif int blocksPerGrid = static_cast(CeilDiv(count, num_threads_per_block * num_elements_per_thread)); - CUDA_LONG N = static_cast(count); - _BinaryElementWiseSimple<<>>( - lhs_data, - rhs_data, - output_data, - func, - N); - +#define FUNC_CALL(NumElemT) \ + _BinaryElementWiseSimple \ + <<>>( \ + lhs_data, \ + rhs_data, \ + output_data, \ + func, \ + static_cast(N)); + size_t N = static_cast(count); + if (N > static_cast(std::numeric_limits::max())) { + FUNC_CALL(size_t); + } else { + FUNC_CALL(CUDA_LONG); + } +#undef FUNC_CALL } -template -void 
BinaryElementWiseImpl( +template +void _BinaryElementWiseImpl( cudaStream_t stream, int32_t output_rank_or_simple_broadcast, const TArray* lhs_padded_strides, @@ -225,90 +237,124 @@ void BinaryElementWiseImpl( if (count == 0) // special case where there's a dim value of 0 in the output shape return; - #ifdef USE_ROCM +#ifdef USE_ROCM const int num_elements_per_thread = 2; const int num_threads_per_block = 512; - #else +#else const int num_elements_per_thread = GridDim::maxElementsPerThread; const int num_threads_per_block = GridDim::maxThreadsPerBlock; - #endif +#endif int blocksPerGrid = static_cast(CeilDiv(count, num_threads_per_block * num_elements_per_thread)); - CUDA_LONG N = static_cast(count); + NumElemT N = static_cast(count); if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::NoBroadcast)) { - _BinaryElementWiseSimple<<>>( - lhs_data, - rhs_data, - output_data, - func, - N); + _BinaryElementWiseSimple + <<>>( + lhs_data, + rhs_data, + output_data, + func, + N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::LeftScalar)) { - _BinaryElementWiseSimple<<>>( - lhs_data, - rhs_data, - output_data, - func, - N); + _BinaryElementWiseSimple + <<>>( + lhs_data, + rhs_data, + output_data, + func, + N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightScalar)) { - _BinaryElementWiseSimple<<>>( - lhs_data, - rhs_data, - output_data, - func, - N); + _BinaryElementWiseSimple + <<>>( + lhs_data, + rhs_data, + output_data, + func, + N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightPerChannelBatch1)) { - _BinaryElementWiseRhsPerChannelBatch1<<>>( - lhs_data, - rhs_data, - fdm_H, - output_data, - func, - N); + _BinaryElementWiseRhsPerChannelBatch1 + <<>>( + lhs_data, + rhs_data, + fdm_H, + output_data, + func, + N); } else if (output_rank_or_simple_broadcast == static_cast(SimpleBroadcast::RightPerChannelBatchN)) { - _BinaryElementWiseRhsPerChannelBatchN<<>>( - lhs_data, - rhs_data, - fdm_H, - fdm_C, - output_data, - func, - N); + _BinaryElementWiseRhsPerChannelBatchN + <<>>( + lhs_data, + rhs_data, + fdm_H, + fdm_C, + output_data, + func, + N); } else { if (lhs_padded_strides && rhs_padded_strides && lhs_padded_strides->Size() && rhs_padded_strides->Size()) - _BinaryElementWise<<>>( - output_rank_or_simple_broadcast, - *lhs_padded_strides, - lhs_data, - *rhs_padded_strides, - rhs_data, - *fdm_output_strides, - output_data, - func, - N); + _BinaryElementWise + <<>>( + output_rank_or_simple_broadcast, + *lhs_padded_strides, + lhs_data, + *rhs_padded_strides, + rhs_data, + *fdm_output_strides, + output_data, + func, + N); else if (lhs_padded_strides && lhs_padded_strides->Size()) - _BinaryElementWise<<>>( - output_rank_or_simple_broadcast, - *lhs_padded_strides, - lhs_data, - TArray(), // rhs is not computed, so no need to deference rhs_padded_strides - rhs_data, - *fdm_output_strides, - output_data, - func, - N); + _BinaryElementWise + <<>>( + output_rank_or_simple_broadcast, + *lhs_padded_strides, + lhs_data, + TArray(), // rhs is not computed, so no need to deference rhs_padded_strides + rhs_data, + *fdm_output_strides, + output_data, + func, + N); else if (rhs_padded_strides && rhs_padded_strides->Size()) - _BinaryElementWise<<>>( - output_rank_or_simple_broadcast, - TArray(), // lhs is not computed, so no need to deference lhs_padded_strides - lhs_data, - *rhs_padded_strides, - rhs_data, - *fdm_output_strides, - output_data, - func, - N); + _BinaryElementWise + <<>>( + 
output_rank_or_simple_broadcast, + TArray(), // lhs is not computed, so no need to deference lhs_padded_strides + lhs_data, + *rhs_padded_strides, + rhs_data, + *fdm_output_strides, + output_data, + func, + N); } } +template +void BinaryElementWiseImpl( + cudaStream_t stream, + int32_t output_rank_or_simple_broadcast, + const TArray* lhs_padded_strides, + const T1* lhs_data, + const TArray* rhs_padded_strides, + const T2* rhs_data, + const TArray* fdm_output_strides, + const fast_divmod& fdm_H, + const fast_divmod& fdm_C, + T* output_data, + const FuncT& func, + size_t count) { +#define FUNC_CALL(NumElemT) \ + _BinaryElementWiseImpl(stream, output_rank_or_simple_broadcast, \ + lhs_padded_strides, lhs_data, rhs_padded_strides, rhs_data, \ + fdm_output_strides, fdm_H, fdm_C, output_data, func, static_cast(count)); + + if (count > static_cast(std::numeric_limits::max())) { + FUNC_CALL(size_t) + } else { + FUNC_CALL(CUDA_LONG) + } +#undef FUNC_CALL +} // namespace cuda } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/cuda_call.cc b/onnxruntime/core/providers/cuda/cuda_call.cc index f60684795a4bc..c73b23f3762ed 100644 --- a/onnxruntime/core/providers/cuda/cuda_call.cc +++ b/onnxruntime/core/providers/cuda/cuda_call.cc @@ -103,7 +103,7 @@ std::conditional_t CudaCall( if (gethostname(hostname, HOST_NAME_MAX) != 0) strcpy(hostname, "?"); #endif - int currentCudaDevice; + int currentCudaDevice = -1; cudaGetDevice(¤tCudaDevice); cudaGetLastError(); // clear last CUDA error static char str[1024]; diff --git a/onnxruntime/core/providers/rocm/rocm_call.cc b/onnxruntime/core/providers/rocm/rocm_call.cc index 484e59f4de7d8..7974053c32497 100644 --- a/onnxruntime/core/providers/rocm/rocm_call.cc +++ b/onnxruntime/core/providers/rocm/rocm_call.cc @@ -104,7 +104,7 @@ std::conditional_t RocmCall( if (gethostname(hostname, HOST_NAME_MAX) != 0) strcpy(hostname, "?"); #endif - int currentHipDevice; + int currentHipDevice = -1; ORT_IGNORE_RETURN_VALUE(hipGetDevice(¤tHipDevice)); // void to silence nodiscard ORT_IGNORE_RETURN_VALUE(hipGetLastError()); // clear last ROCM error; void to silence nodiscard static char str[1024]; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h index a54b728c17c44..df12d90338782 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_utils.h @@ -8,7 +8,6 @@ #include #include #include -#include #include "flatbuffers/idl.h" #include "ort_trt_int8_cal_table.fbs.h" #include @@ -16,7 +15,7 @@ #include "core/common/path_string.h" #include "core/framework/murmurhash3.h" -namespace fs = std::experimental::filesystem; +namespace fs = std::filesystem; namespace onnxruntime { diff --git a/onnxruntime/core/providers/vitisai/imp/global_api.cc b/onnxruntime/core/providers/vitisai/imp/global_api.cc index 00fb8c1578ff4..a963e656c457b 100644 --- a/onnxruntime/core/providers/vitisai/imp/global_api.cc +++ b/onnxruntime/core/providers/vitisai/imp/global_api.cc @@ -356,10 +356,18 @@ vaip_core::OrtApiForVaip* create_org_api_hook() { the_global_api.tensor_proto_get_shape_unsafe = vaip::tensor_proto_get_shape; the_global_api.tensor_proto_data_type = [](const ONNX_NAMESPACE::TensorProto& t) -> int { return t.data_type(); }; the_global_api.tensor_proto_delete = [](ONNX_NAMESPACE::TensorProto* tp) { delete tp; }; - the_global_api.tensor_proto_new_floats = 
vaip::tensor_proto_new_floats; + the_global_api.tensor_proto_new_i8 = vaip::tensor_proto_new_i8; + the_global_api.tensor_proto_new_i16 = vaip::tensor_proto_new_i16; the_global_api.tensor_proto_new_i32 = vaip::tensor_proto_new_i32; the_global_api.tensor_proto_new_i64 = vaip::tensor_proto_new_i64; - the_global_api.tensor_proto_new_i8 = vaip::tensor_proto_new_i8; + the_global_api.tensor_proto_new_u8 = vaip::tensor_proto_new_u8; + the_global_api.tensor_proto_new_u16 = vaip::tensor_proto_new_u16; + the_global_api.tensor_proto_new_u32 = vaip::tensor_proto_new_u32; + the_global_api.tensor_proto_new_u64 = vaip::tensor_proto_new_u64; + the_global_api.tensor_proto_new_floats = vaip::tensor_proto_new_floats; + the_global_api.tensor_proto_new_doubles = vaip::tensor_proto_new_doubles; + the_global_api.tensor_proto_new_bf16 = vaip::tensor_proto_new_bf16; + the_global_api.tensor_proto_new_fp16 = vaip::tensor_proto_new_fp16; the_global_api.tensor_proto_raw_data_size = [](const auto& tensor) { return tensor.raw_data().size(); }; the_global_api.tensor_proto_as_raw = vaip::tensor_proto_as_raw; the_global_api.tensor_proto_get_name = [](const auto& tensor) -> const std::string& { return tensor.name(); }; diff --git a/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc b/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc index 671d852abb0d6..63aa1daf7e18f 100644 --- a/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc +++ b/onnxruntime/core/providers/vitisai/imp/tensor_proto.cc @@ -50,28 +50,67 @@ static ONNX_NAMESPACE::TensorProto* tensor_proto_new(const std::string& name, co return tensor_proto.release(); } +ONNX_NAMESPACE::TensorProto* tensor_proto_new_i8(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT8, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} + +ONNX_NAMESPACE::TensorProto* tensor_proto_new_i16(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT16, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} ONNX_NAMESPACE::TensorProto* tensor_proto_new_i32(const std::string& name, const std::vector& shape, const std::vector& data) { return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT32, - reinterpret_cast(&data[0]), data.size() * sizeof(int32_t)); + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); } - ONNX_NAMESPACE::TensorProto* tensor_proto_new_i64(const std::string& name, const std::vector& shape, const std::vector& data) { - return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT64, - reinterpret_cast(&data[0]), data.size() * sizeof(int64_t)); + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT32, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); } - -ONNX_NAMESPACE::TensorProto* tensor_proto_new_i8(const std::string& name, const std::vector& shape, - const std::vector& data) { - return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_INT8, - reinterpret_cast(&data[0]), data.size() * sizeof(int8_t)); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u8(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_UINT8, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u16(const 
std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_UINT16, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u32(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_UINT32, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u64(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_UINT32, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); } ONNX_NAMESPACE::TensorProto* tensor_proto_new_floats(const std::string& name, const std::vector& shape, const std::vector& data) { return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_FLOAT, - reinterpret_cast(&data[0]), data.size() * sizeof(float)); + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} +ONNX_NAMESPACE::TensorProto* tensor_proto_new_doubles(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); } +ONNX_NAMESPACE::TensorProto* tensor_proto_new_bf16(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_BFLOAT16, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} +ONNX_NAMESPACE::TensorProto* tensor_proto_new_fp16(const std::string& name, const std::vector& shape, + const std::vector& data) { + return tensor_proto_new(name, shape, ONNX_NAMESPACE::TensorProto_DataType_FLOAT16, + reinterpret_cast(&data[0]), data.size() * sizeof(data[0])); +} } // namespace vaip diff --git a/onnxruntime/core/providers/vitisai/imp/tensor_proto.h b/onnxruntime/core/providers/vitisai/imp/tensor_proto.h index 292905ca734f1..417f9d2f4bf31 100644 --- a/onnxruntime/core/providers/vitisai/imp/tensor_proto.h +++ b/onnxruntime/core/providers/vitisai/imp/tensor_proto.h @@ -11,10 +11,26 @@ vaip_core::DllSafe> tensor_proto_get_shape(const ONNX_NAMES const std::string& tensor_proto_get_name(const ONNX_NAMESPACE::TensorProto& tensor); ONNX_NAMESPACE::TensorProto* tensor_proto_new_i8(const std::string& name, const std::vector& shape, const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u8(const std::string& name, const std::vector& shape, + const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_i16(const std::string& name, const std::vector& shape, + const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u16(const std::string& name, const std::vector& shape, + const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u32(const std::string& name, const std::vector& shape, + const std::vector& data); ONNX_NAMESPACE::TensorProto* tensor_proto_new_i32(const std::string& name, const std::vector& shape, const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_u64(const std::string& name, const std::vector& shape, + const std::vector& data); ONNX_NAMESPACE::TensorProto* tensor_proto_new_i64(const std::string& name, const std::vector& shape, const std::vector& data); ONNX_NAMESPACE::TensorProto* tensor_proto_new_floats(const std::string& name, const 
std::vector& shape, const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_bf16(const std::string& name, const std::vector& shape, + const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_fp16(const std::string& name, const std::vector& shape, + const std::vector& data); +ONNX_NAMESPACE::TensorProto* tensor_proto_new_doubles(const std::string& name, const std::vector& shape, + const std::vector& data); } // namespace vaip diff --git a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h index 2c12d26fd2c31..62a7bb602e7e8 100644 --- a/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h +++ b/onnxruntime/core/providers/vitisai/include/vaip/vaip_ort_api.h @@ -12,7 +12,7 @@ struct OrtApi; namespace vaip_core { -#define VAIP_ORT_API_MAJOR (2u) +#define VAIP_ORT_API_MAJOR (3u) #define VAIP_ORT_API_MINOR (0u) #define VAIP_ORT_API_PATCH (0u) struct OrtApiForVaip { @@ -198,6 +198,31 @@ struct OrtApiForVaip { DllSafe (*get_lib_name)(); // [81] /** new API after 2.0 */ void (*graph_add_initialized_tensor)(Graph& graph, const TensorProto& tensor); // [82] + /** new API after 3.0 */ + TensorProto* (*tensor_proto_new_doubles)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [83] + TensorProto* (*tensor_proto_new_i16)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [84 + TensorProto* (*tensor_proto_new_u16)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [84] + TensorProto* (*tensor_proto_new_u32)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [85] + TensorProto* (*tensor_proto_new_u8)(const std::string& name, + const std::vector& shape, + const std::vector& data); // [86] + TensorProto* (*tensor_proto_new_u64)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [87] + TensorProto* (*tensor_proto_new_fp16)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [88] + TensorProto* (*tensor_proto_new_bf16)( + const std::string& name, const std::vector& shape, + const std::vector& data); // [89] }; #ifndef USE_VITISAI diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index 486f7f69be15c..7c84a14ba7d88 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -157,8 +157,8 @@ std::vector> GetSupportedNodes(const GraphViewer& graph_v static const InlinedHashMap op_map = { {"Abs", {"abs", true}}, {"Add", {"add", true}}, - {"ArgMax", {"argMax", false}}, - {"ArgMin", {"argMin", false}}, + {"ArgMax", {"argMax", true}}, + {"ArgMin", {"argMin", true}}, {"AveragePool", {"averagePool2d", true}}, {"BatchNormalization", {"batchNormalization", false}}, {"Cast", {"cast", false}}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc index 163c9b0fb91d3..af0f0133b497a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/activation_op_builder.cc @@ -20,7 +20,7 @@ class ActivationOpBuilder : public BaseOpBuilder { // Operator support related. 
bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; + WebnnDeviceType device_type, const logging::Logger& logger) const override; bool HasSupportedInputsImpl(const Node& node, const WebnnDeviceType device_type, const logging::Logger& logger) const override; }; @@ -72,14 +72,24 @@ Status ActivationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. bool ActivationOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, - WebnnDeviceType /* device_type */, + WebnnDeviceType device_type, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); + const auto& op_type = node.OpType(); std::vector input_shape; if (!GetShape(*input_defs[0], input_shape, logger)) return false; + if (op_type == "Elu" && device_type == WebnnDeviceType::CPU) { + NodeAttrHelper helper(node); + float alpha = helper.Get("alpha", 1.0f); + if (alpha != 1.0f) { + LOGS(logger, VERBOSE) << "WebNN CPU backend only supports Elu's alpha == 1.0"; + return false; + } + } + return true; } diff --git a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc index 7926311f3c4e6..f8b77b6350a76 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/argmax_min_op_builder.cc @@ -21,7 +21,7 @@ class ArgMaxMinOpBuilder : public BaseOpBuilder { // Operator support related. bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; + WebnnDeviceType device_type, const logging::Logger& logger) const override; bool HasSupportedInputsImpl(const Node& node, const WebnnDeviceType device_type, const logging::Logger& logger) const override; }; @@ -68,7 +68,7 @@ Status ArgMaxMinOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, // Operator support related. bool ArgMaxMinOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, - WebnnDeviceType /* device_type */, + WebnnDeviceType device_type, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); @@ -76,6 +76,15 @@ bool ArgMaxMinOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initia if (!GetShape(*input_defs[0], input_shape, logger)) return false; + // WebNN CPU backend only supports select_last_index = 0. + if (device_type == WebnnDeviceType::CPU) { + NodeAttrHelper helper(node); + const auto select_last_index = helper.Get("select_last_index", 0); + if (select_last_index) { + LOGS(logger, VERBOSE) << "ArgMax/ArgMin with select_last_index = 1 is not supported on WebNN CPU backend."; + return false; + } + } return true; } diff --git a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc index 30848b666003d..e6403a4cd12dc 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/clip_op_builder.cc @@ -24,7 +24,7 @@ class ClipOpBuilder : public BaseOpBuilder { // Operator support related. 
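For reference, a minimal sketch of the two Elu variants the check above distinguishes. This uses the Python onnx helper API and a hypothetical make_elu_model helper; it is not part of this change. Per the added check, the default alpha of 1.0 remains eligible for the WebNN CPU backend, while any other alpha is reported as unsupported and the node falls back to another EP.

```
import onnx
from onnx import helper, TensorProto

def make_elu_model(alpha: float) -> onnx.ModelProto:
    # Single-node graph: Y = Elu(X) with the given alpha attribute.
    node = helper.make_node("Elu", ["X"], ["Y"], alpha=alpha)
    graph = helper.make_graph(
        [node], "elu_example",
        [helper.make_tensor_value_info("X", TensorProto.FLOAT, [1, 4])],
        [helper.make_tensor_value_info("Y", TensorProto.FLOAT, [1, 4])],
    )
    return helper.make_model(graph)

supported = make_elu_model(alpha=1.0)  # accepted by the WebNN CPU check above
fallback = make_elu_model(alpha=0.5)   # rejected on WebNN CPU; assigned elsewhere
```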
private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; + const WebnnDeviceType device_type, const logging::Logger& logger) const override; bool HasSupportedInputsImpl(const Node& node, const WebnnDeviceType device_type, const logging::Logger& logger) const override; }; @@ -64,13 +64,33 @@ Status ClipOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool ClipOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, + const WebnnDeviceType device_type, const logging::Logger& logger) const { // TODO: Update IsOpSupportedImpl to pass GraphViewer instead of InitializedTensorSet so the implementations // can ensure initializers are constant. See #19401 for details of how this update was made to the NNAPI EP. // GetClipMinMax(graph_viewer, node, minValue, maxValue, logger) float min, max; - return GetClipMinMax(initializers, node, min, max, logger); + if (GetClipMinMax(initializers, node, min, max, logger)) { + // WebNN CPU backend only supports 3 specific ranges: [0.0, infinity], [-1.0, 1.0], [0.0, 6.0]. + // TODO: Remove this workaround once the associated issue is resolved in Chromium: + // https://issues.chromium.org/issues/326156496. + if (device_type == WebnnDeviceType::CPU) { + if ((min == 0.0f && max == std::numeric_limits::infinity()) || + (min == -1.0f && max == 1.0f) || + (min == 0.0f && max == 6.0f)) { + return true; + } else { + LOGS(logger, VERBOSE) << "Clip min and max values (" + << min << ", " + << max << ") are not supported for WebNN CPU backend"; + return false; + } + } + + return true; + } else { + return false; + }; } bool ClipOpBuilder::HasSupportedInputsImpl(const Node& node, const WebnnDeviceType device_type, diff --git a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc index d3fa00e5fe32b..e4f98b09e03c5 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/concat_op_builder.cc @@ -36,40 +36,14 @@ Status ConcatOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, NodeAttrHelper helper(node); uint32_t axis = static_cast(HandleNegativeAxis(helper.Get("axis", 1), rank)); - const size_t num_inputs = input_defs.size(); std::vector inputs; for (const auto* input : input_defs) { LOGS(logger, VERBOSE) << "input name " << input->Name(); inputs.push_back(model_builder.GetOperand(input->Name())); } - emscripten::val output = emscripten::val::undefined(); - if (num_inputs <= 4 || model_builder.GetPreferredLayout() == DataLayout::NCHW) { - output = model_builder.GetBuilder().call("concat", emscripten::val::array(inputs), axis); - } else { - // WebNN XNNPack backend only supports the concat with inputs number <= 4, - // decomposing the Concat with inputs number > 4 into multiple WebNN concat ops. - size_t remaining_inputs = num_inputs; - size_t max_inputs = 4; - while (remaining_inputs > 0) { - std::vector chunk_inputs; - - // Push the last concated output to the next chunk_inputs. 
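As a rough illustration of the Clip constraint above: only three (min, max) pairs are admitted on the WebNN CPU backend, corresponding to relu, a clamp to [-1, 1], and relu6. The helper below is hypothetical and only mirrors the C++ check; it is not code from this change.

```
import math

# The three (min, max) ranges accepted by the WebNN CPU backend per the check above.
SUPPORTED_CLIP_RANGES = [
    (0.0, math.inf),  # relu
    (-1.0, 1.0),      # clamp to [-1, 1]
    (0.0, 6.0),       # relu6
]

def webnn_cpu_supports_clip(min_value: float, max_value: float) -> bool:
    # Sketch of ClipOpBuilder::IsOpSupportedImpl for device_type == CPU.
    return (min_value, max_value) in SUPPORTED_CLIP_RANGES

assert webnn_cpu_supports_clip(0.0, 6.0)
assert not webnn_cpu_supports_clip(-3.0, 3.0)  # would fall back to another EP
```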
- if (output != emscripten::val::undefined()) { - chunk_inputs.push_back(output); - max_inputs = 3; - } - - size_t chunk_size = std::min(remaining_inputs, max_inputs); - - for (size_t i = 0; i < chunk_size; i++) { - chunk_inputs.push_back(inputs[num_inputs - remaining_inputs + i]); - } - - output = model_builder.GetBuilder().call("concat", emscripten::val::array(chunk_inputs), axis); - remaining_inputs -= chunk_size; - } - } + emscripten::val output = + model_builder.GetBuilder().call("concat", emscripten::val::array(inputs), axis); model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); return Status::OK(); diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 248463f473b2e..53f885019ab2f 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -23,7 +23,7 @@ class GemmOpBuilder : public BaseOpBuilder { // Operator support related. private: - bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, + bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; bool HasSupportedInputsImpl(const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; @@ -64,13 +64,9 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N b = model_builder.GetBuilder().call("reshape", b, emscripten::val::array(GetVecUint32FromVecInt64(b_shape))); } - // The inputs of MatMul must be at least 3D for WebNN CPU backend. Use GEMM for 2D case. - // TODO: Remove this workaround when it is fixed in Chromium. - if (model_builder.GetWebnnDeviceType() == WebnnDeviceType::CPU && a_shape.size() == 2) { - output = model_builder.GetBuilder().call("gemm", a, b); - } else { - output = model_builder.GetBuilder().call("matmul", a, b); - } + + output = model_builder.GetBuilder().call("matmul", a, b); + // If the inputs are both 1D, reduce the output to a scalar. if (extended_a_shape && extended_b_shape) { output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array()); @@ -132,11 +128,10 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N // Operator support related. -bool GemmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, +bool GemmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, - const WebnnDeviceType device_type, + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { - (void)initializers; const auto& op_type = node.OpType(); const auto& input_defs(node.InputDefs()); const size_t a_idx = 0, b_idx = 1, c_idx = 2; // A*B+C @@ -194,30 +189,6 @@ bool GemmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, } } - if (op_type == "MatMul") { - // If the first argument is 1-D, it is promoted to a matrix by prepending a 1 to its dimensions. - // If the second argument is 1-D, it is promoted to a matrix by appending a 1 to its dimensions. - if (a_shape.size() == 1) a_shape.insert(a_shape.begin(), 1); - if (b_shape.size() == 1) b_shape.push_back(1); - - // WebNN CPU backend has two more constraints. 
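To make the MatMul shape rules in the removed comment concrete, here is a standalone NumPy sketch (NumPy follows the same promotion and broadcasting convention as ONNX MatMul, which is not specific to this change): a 1-D first input is promoted by prepending a 1, the batch dimensions broadcast, and the prepended 1 is dropped from the result. This is exactly the rank/batch mismatch the deleted WebNN CPU check used to reject.

```
import numpy as np

a = np.ones((3,), dtype=np.float32)       # 1-D: promoted to (1, 3)
b = np.ones((2, 3, 4), dtype=np.float32)  # batched matrix

# numpy.matmul applies the same promotion/broadcasting rules as ONNX MatMul.
c = np.matmul(a, b)
print(c.shape)  # (2, 4): the prepended 1 is removed after the product
```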
- // https://source.chromium.org/chromium/chromium/src/+/main:third_party/blink/renderer/modules/ml/webnn/ml_graph_xnnpack.cc;l=1177 - // TODO: Remove this workaround when Chromium enables broadcast for MatMul on WebNN CPU backend. - if (device_type == WebnnDeviceType::CPU) { - if (a_shape.size() != b_shape.size()) { - LOGS(logger, VERBOSE) << "The rank of two inputs for WebNN CPU backend MatMul must be the same."; - return false; - } - - for (size_t i = 0; i < a_shape.size() - 2; i++) { - if (a_shape[i] != b_shape[i]) { - LOGS(logger, VERBOSE) << "WebNN CPU backend can't support broadcasting for MatMul."; - return false; - } - } - } - } - return true; } diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc index ea54b70a66677..c4ca980fec715 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc @@ -30,7 +30,7 @@ class ResizeOpBuilder : public BaseOpBuilder { // Operator support related. private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType device_type, const logging::Logger& logger) const override; + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; // Resize opset 10- is very different than Resize opset 11+, with many key attributes missing. // We only support Resize opset 11+ here. @@ -164,7 +164,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType device_type, + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); @@ -184,18 +184,10 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers const auto mode = helper.Get("mode", "nearest"); bool is_linear_resize = mode == "linear"; bool is_nearest_resize = mode == "nearest"; - // WebNN CPU backend only supports "linear" mode. - // WebNN GPU backend only supports "linear" and "nearest" modes. - if (device_type == WebnnDeviceType::CPU) { - if (!is_linear_resize) { - LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode << " for CPU backend."; - return false; - } - } else { - if (!is_linear_resize && !is_nearest_resize) { - LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode << " for GPU backend."; - return false; - } + // WebNN only supports "linear" and "nearest" modes. + if (!is_linear_resize && !is_nearest_resize) { + LOGS(logger, VERBOSE) << "Resize does not support input mode: " << mode; + return false; } const auto exclude_outside = helper.Get("exclude_outside", 0); diff --git a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc index c50b678bf2386..ea3b8ef384ddc 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc @@ -27,7 +27,7 @@ class SplitOpBuilder : public BaseOpBuilder { // Operator support related. 
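A short sketch of how the simplified Resize check above now gates support uniformly across WebNN backends. The helper name is hypothetical and the snippet only restates the mode check; "nearest" is the attribute default used by the builder.

```
def webnn_supports_resize_mode(mode: str = "nearest") -> bool:
    # After this change, "linear" and "nearest" are accepted for every WebNN
    # device type; any other mode (e.g. "cubic") is reported as unsupported.
    return mode in ("linear", "nearest")

assert webnn_supports_resize_mode("linear")
assert not webnn_supports_resize_mode("cubic")
```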
private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType device_type, const logging::Logger& logger) const override; + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; // Add operator related. @@ -94,7 +94,7 @@ Status SplitOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool SplitOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType device_type, + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); std::vector input_shape; @@ -126,10 +126,6 @@ bool SplitOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, LOGS(logger, VERBOSE) << "Cannot get split."; return false; } - if (split.size() > 4 && device_type == WebnnDeviceType::CPU) { - LOGS(logger, VERBOSE) << "WebNN CPU backend only supports up to 4 outputs."; - return false; - } } else { if (helper.HasAttr("num_outputs")) { // Split has 'num_outputs' attribute when opset is 18. @@ -138,10 +134,6 @@ bool SplitOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, LOGS(logger, VERBOSE) << "The 'num_outputs' must be a positive integer."; return false; } - if (num_outputs > 4 && device_type == WebnnDeviceType::CPU) { - LOGS(logger, VERBOSE) << "WebNN CPU backend only supports up to 4 outputs."; - return false; - } } else { const auto opset = node.SinceVersion(); if (opset >= 18) { diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.h b/onnxruntime/core/providers/webnn/builders/model_builder.h index 8c1848eb833c1..80077b3abe56d 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.h +++ b/onnxruntime/core/providers/webnn/builders/model_builder.h @@ -53,7 +53,7 @@ class ModelBuilder { void AddInitializerToSkip(const std::string& tensor_name); // There are some input which will not be used, add it to a list which will not - // be added to CoreML model, since CoreML does not like input unused. + // be added to WebNN model, since WebNN does not like input unused. 
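For context on the num_outputs check kept above, a small sketch of the opset-18 Split behaviour it guards: when only num_outputs is given, the input axis is divided into equal chunks of ceil(dim / num_outputs), with the last chunk absorbing any remainder. The helper below is hypothetical, written against the ONNX opset-18 Split description rather than taken from this change.

```
def split_sizes(dim: int, num_outputs: int) -> list[int]:
    # Opset-18 Split with only 'num_outputs': equal chunks, last chunk may be smaller.
    assert num_outputs > 0, "num_outputs must be a positive integer"
    chunk = -(-dim // num_outputs)  # ceil(dim / num_outputs)
    sizes = [chunk] * (num_outputs - 1)
    sizes.append(dim - chunk * (num_outputs - 1))
    return sizes

print(split_sizes(10, 4))  # [3, 3, 3, 1]
```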
void AddInputToSkip(const std::string& input_name); std::string GetUniqueName(const std::string& base_name); diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index d18b3ac40d489..7f7ed5e436afe 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -2099,22 +2099,36 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_CUDA, _In_ OrtSessi return OrtApis::SessionOptionsAppendExecutionProvider_CUDA(options, &provider_options); } -ORT_API_STATUS_IMPL(OrtApis::SetCurrentGpuDeviceId, _In_ int device_id) { +ORT_API_STATUS_IMPL(OrtApis::SetCurrentGpuDeviceId, [[maybe_unused]] _In_ int device_id) { API_IMPL_BEGIN + +#ifdef USE_CUDA if (auto* info = onnxruntime::TryGetProviderInfo_CUDA()) return info->SetCurrentGpuDeviceId(device_id); +#endif + +#ifdef USE_ROCM if (auto* info = onnxruntime::TryGetProviderInfo_ROCM()) return info->SetCurrentGpuDeviceId(device_id); +#endif + return CreateStatus(ORT_FAIL, "CUDA and/or ROCM execution provider is either not enabled or not available."); API_IMPL_END } -ORT_API_STATUS_IMPL(OrtApis::GetCurrentGpuDeviceId, _In_ int* device_id) { +ORT_API_STATUS_IMPL(OrtApis::GetCurrentGpuDeviceId, [[maybe_unused]] _In_ int* device_id) { API_IMPL_BEGIN + +#ifdef USE_CUDA if (auto* info = onnxruntime::TryGetProviderInfo_CUDA()) return info->GetCurrentGpuDeviceId(device_id); +#endif + +#ifdef USE_ROCM if (auto* info = onnxruntime::TryGetProviderInfo_ROCM()) return info->GetCurrentGpuDeviceId(device_id); +#endif + return CreateStatus(ORT_FAIL, "CUDA and/or ROCM execution provider is either not enabled or not available."); API_IMPL_END } diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 7f9a6e13d7864..b1784f700d1fa 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1066,13 +1066,12 @@ std::unique_ptr CreateExecutionProviderInstance( #endif } else if (type == kVitisAIExecutionProvider) { #ifdef USE_VITISAI + ProviderOptions info{}; const auto it = provider_options_map.find(type); - if (it == provider_options_map.end()) { - LOGS_DEFAULT(FATAL) << "cannot find provider options for VitisAIExecutionProvider"; + if (it != provider_options_map.end()) { + info = it->second; } - const auto& vitis_option_map = it->second; - return onnxruntime::VitisAIProviderFactoryCreator::Create(vitis_option_map) - ->CreateProvider(); + return onnxruntime::VitisAIProviderFactoryCreator::Create(info)->CreateProvider(); #endif } else if (type == kAclExecutionProvider) { #ifdef USE_ACL diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index 74e213fa61362..06d2ce30b9b37 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -25,6 +25,7 @@ find_by_name, model_has_infer_metadata, normalize_axis, + pack_bytes_to_4bit, quantize_data, quantize_nparray, save_and_reload_model_with_shape_infer, @@ -340,13 +341,17 @@ def quantize_initializer_impl(self, weight, qType, reduce_range=False, keep_floa f"\nraw={str(q_weight_initializer)[:200]}." ) elif qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4): - # TODO: Use simpler make_tensor call when ONNX bug that does not store negative weights packed - # within int32_data is fixed. 
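With the pybind change above, the VitisAI execution provider no longer requires an explicit provider-options map when a session is created from Python. A minimal usage sketch follows; it assumes an onnxruntime build with the VitisAI EP enabled and uses "model.onnx" as a placeholder path, neither of which comes from this diff.

```
import onnxruntime as ort

# Provider options are now optional for VitisAI; omitting the options dict is accepted.
session = ort.InferenceSession(
    "model.onnx",
    providers=["VitisAIExecutionProvider", "CPUExecutionProvider"],
)
print(session.get_providers())
```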
- # q_weight_initializer = onnx.helper.make_tensor(q_weight_name, qType, weight.dims, q_weight_data) - packed_data = onnx.helper.pack_float32_to_4bit(q_weight_data.flatten(), qType == onnx.TensorProto.INT4) - q_weight_initializer = onnx.helper.make_tensor( - q_weight_name, qType, weight.dims, packed_data.tobytes(), raw=True - ) + if q_weight_data.dtype not in (np.int8, np.uint8): + raise RuntimeError( + f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values." + ) + + # We do not use onnx.helper.pack_float32_to_4bit() due to performance. + # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes. + packed_data = bytes(pack_bytes_to_4bit(q_weight_data.tobytes())) + + # We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161 + q_weight_initializer = onnx.helper.make_tensor(q_weight_name, qType, weight.dims, packed_data, raw=True) else: q_weight_data = np.asarray(q_weight_data, dtype=onnx.helper.tensor_dtype_to_np_dtype(qType)).reshape( weight.dims @@ -483,16 +488,18 @@ def quantize_weight_per_channel_impl( if not keep_float_weight: if weight_qType in (onnx.TensorProto.INT4, onnx.TensorProto.UINT4): - # TODO: Use simpler make_tensor call when ONNX bug that does not store negative weights packed - # within int32_data is fixed. - # q_weight_initializer = onnx.helper.make_tensor( - # q_weight_name, weight_qType, weights_shape, quantized_weights - # ) - packed_data = onnx.helper.pack_float32_to_4bit( - quantized_weights.flatten(), weight_qType == onnx.TensorProto.INT4 - ) + if quantized_weights.dtype not in (np.int8, np.uint8): + raise RuntimeError( + f"Quantized weights for {q_weight_name} must be 8-bit before packing as 4-bit values." + ) + + # We do not use onnx.helper.pack_float32_to_4bit() due to performance. + # This can be the difference between a large model taking 30 minutes to quantize vs 5 minutes. + packed_data = bytes(pack_bytes_to_4bit(quantized_weights.tobytes())) + + # We only use onnx.helper.make_tensor with raw data due to bug: https://github.com/onnx/onnx/pull/6161 q_weight_initializer = onnx.helper.make_tensor( - q_weight_name, weight_qType, weights_shape, packed_data.tobytes(), raw=True + q_weight_name, weight_qType, weights_shape, packed_data, raw=True ) self.model.initializer_extend([q_weight_initializer]) else: diff --git a/onnxruntime/python/tools/quantization/quant_utils.py b/onnxruntime/python/tools/quantization/quant_utils.py index bdf6d5a355206..53d2eaeaba70b 100644 --- a/onnxruntime/python/tools/quantization/quant_utils.py +++ b/onnxruntime/python/tools/quantization/quant_utils.py @@ -21,10 +21,18 @@ from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions try: - from onnx.reference.custom_element_types import float8e4m3fn, int4, uint4 + from onnx.reference.custom_element_types import float8e4m3fn except ImportError: float8e4m3fn = None +# INT4 np.dtypes added in ONNX 1.16. These map to np.int8/np.uint8 because numpy +# does not support sub-byte types. 
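To make the new packing path concrete, here is a small, self-contained NumPy/onnx sketch (standalone illustration, not code from the quantizer) of how int4-quantized weights held as np.int8 are packed two-per-byte, low nibble first, matching the layout of the pack_bytes_to_4bit helper introduced in this change, and then stored through onnx.helper.make_tensor with raw data, the same route quantize_initializer_impl now takes.

```
import numpy as np
import onnx
from onnx import helper, TensorProto

# int4-quantized weights are held as np.int8 values already clamped to [-8, 7].
q = np.array([-8, 7, 3, -1, 5], dtype=np.int8)

# Pack two 4-bit values per byte, low nibble first (same layout as pack_bytes_to_4bit).
src = q.tobytes()
packed = bytearray((len(src) + 1) // 2)
for i, b in enumerate(src):
    packed[i // 2] |= (b & 0xF) << (4 * (i % 2))

# Store as raw bytes; dims still describe the unpacked element count.
init = helper.make_tensor("w_q4", TensorProto.INT4, [len(q)], bytes(packed), raw=True)
print(len(packed))  # 3 bytes for 5 int4 elements
```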
+try: + from onnx.reference.custom_element_types import int4, uint4 +except ImportError: + int4 = None + uint4 = None + __producer__ = "onnx.quantize" __version__ = "0.1.0" @@ -134,8 +142,8 @@ def from_string(format): onnx_proto.TensorProto.INT16: numpy.dtype("int16"), onnx_proto.TensorProto.UINT16: numpy.dtype("uint16"), onnx_proto.TensorProto.FLOAT8E4M3FN: float8e4m3fn, - onnx_proto.TensorProto.INT4: int4, - onnx_proto.TensorProto.UINT4: uint4, + onnx_proto.TensorProto.INT4: int4, # base_dtype is np.int8 + onnx_proto.TensorProto.UINT4: uint4, # base_dtype is np.uint8 } ONNX_INT_TYPE_RANGE = { @@ -212,36 +220,12 @@ def quantize_nparray(qType, arr, scale, zero_point, low=None, high=None): ) ref = ReferenceEvaluator(onnx_model) return _check_type(ref.run(None, {"X": arr, "scale": scale})[0]) - elif qType in ( - onnx_proto.TensorProto.INT4, - onnx_proto.TensorProto.UINT4, - ): - if arr.dtype == numpy.float32: - onnx_type = TensorProto.FLOAT - elif arr.dtype == numpy.float16: - onnx_type = TensorProto.FLOAT16 - else: - raise ValueError(f"Unexpected dtype {arr.dtype}.") - onnx_model = make_model( - make_graph( - [ - make_node("QuantizeLinear", ["X", "scale", "zero_point"], ["Y"]), - ], - "qu", - [ - make_tensor_value_info("X", onnx_type, None), - make_tensor_value_info("scale", onnx_type, None), - make_tensor_value_info("zero_point", qType, None), - ], - [make_tensor_value_info("Y", qType, None)], - ) - ) - # The reference ONNX implementation of QuantizeLinear returns "unpacked" int8 numpy values - # because numpy cannot represent 4bit values (although ONNX TensorProto has no problem with this). - # These "unpacked" int8 values are correctly re-packed when passed to onnx.make_tensor(). - ref = ReferenceEvaluator(onnx_model) - return _check_type(ref.run(None, {"X": arr, "scale": scale, "zero_point": zero_point})[0]) else: + # Quantizes data for all integer types. + # + # For int4 types, the quantized data is returned as either np.int8 or np.uint8, + # which matches the python reference ONNX implementation of QuantizeLinear. + # This data can be packed into 4-bit elements by using pack_bytes_to_4bit(). dtype = ONNX_TYPE_TO_NP_TYPE[qType] (qmin, qmax) = get_qmin_qmax_for_qType(qType, reduce_range=False, symmetric=True) @@ -482,6 +466,36 @@ def normalize_axis(axis: int, rank: int) -> tuple[bool, int]: return is_valid, axis_norm +def pack_bytes_to_4bit(src_8bit: bytes) -> bytearray: + """ + Copies a source array of 8-bit values into a destination bytearray of packed 4-bit values. + Assumes that the source values are already in the appropriate int4 range. + :parameter src_8bit: The 8-bit element values to pack. + :return A bytearray with every two 8-bit src elements packed into a single byte. + """ + num_elems = len(src_8bit) + if num_elems == 0: + return bytearray() + + dst_size = (num_elems + 1) // 2 # Ex: 5 8-bit elems packed into 3 bytes + dst = bytearray(dst_size) + + src_i: int = 0 + dst_i: int = 0 + + # Pack two 8-bit elements into a single byte in each iteration. + while src_i < num_elems - 1: + dst[dst_i] = ((src_8bit[src_i + 1] & 0xF) << 4) | (src_8bit[src_i] & 0xF) + dst_i += 1 + src_i += 2 + + if src_i < num_elems: + # Odd number of elements. 
+ dst[dst_i] = src_8bit[src_i] & 0xF + + return dst + + class QuantizedInitializer: """ Represents a linearly quantized weight input from ONNX operators diff --git a/onnxruntime/python/tools/transformers/models/llama/requirements.txt b/onnxruntime/python/tools/transformers/models/llama/requirements.txt index ce4b3f6a09ba5..388025165f814 100644 --- a/onnxruntime/python/tools/transformers/models/llama/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/llama/requirements.txt @@ -1,7 +1,7 @@ optimum>=1.14.1 transformers>=4.33.2,<= 4.37.2 torch>=2.2.0 -onnx==1.16.0 +onnx==1.16.1 datasets>=2.8.0 protobuf==3.20.2 -psutil \ No newline at end of file +psutil diff --git a/onnxruntime/python/tools/transformers/models/phi2/requirements.txt b/onnxruntime/python/tools/transformers/models/phi2/requirements.txt index 0b2ea0df93a96..c82022e798482 100644 --- a/onnxruntime/python/tools/transformers/models/phi2/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/phi2/requirements.txt @@ -1,3 +1,3 @@ -onnx==1.16.0 +onnx==1.16.1 transformers>=4.36.2 onnxscript>=0.1.0.dev20240126 diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md index 8607485bc265b..9c1c31626066d 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md @@ -36,16 +36,18 @@ cd onnxruntime Install nvidia-docker using [these instructions](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html#docker). ``` -docker run --rm -it --gpus all -v $PWD:/workspace nvcr.io/nvidia/pytorch:23.10-py3 /bin/bash +docker run --rm -it --gpus all -v $PWD:/workspace nvcr.io/nvidia/pytorch:24.04-py3 /bin/bash ``` #### Build onnxruntime from source +The cuDNN in the container might not be compatible with official onnxruntime-gpu package, it is recommended to build from source instead. + After launching the docker, you can build and install onnxruntime-gpu wheel like the following. ``` -export CUDACXX=/usr/local/cuda-12.2/bin/nvcc +export CUDACXX=/usr/local/cuda/bin/nvcc git config --global --add safe.directory '*' -sh build.sh --config Release --build_shared_lib --parallel --use_cuda --cuda_version 12.2 \ - --cuda_home /usr/local/cuda-12.2 --cudnn_home /usr/lib/x86_64-linux-gnu/ --build_wheel --skip_tests \ +sh build.sh --config Release --build_shared_lib --parallel --use_cuda --cuda_version 12.4 \ + --cuda_home /usr/local/cuda --cudnn_home /usr/lib/x86_64-linux-gnu/ --build_wheel --skip_tests \ --use_tensorrt --tensorrt_home /usr/src/tensorrt \ --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF \ --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=80 \ diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda11.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda11.txt index 447cb54f98ed2..dc6592fc2fa54 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda11.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda11.txt @@ -6,7 +6,7 @@ onnxruntime-gpu>=1.16.2 py3nvml # The version of cuda-python shall be compatible with installed CUDA version. -# For example, if your CUDA version is 12.1, you can install cuda-python 12.1. +# For demo of TensorRT excution provider and TensortRT. 
cuda-python==11.8.0 # For windows, cuda-python need the following diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda12.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda12.txt index 1ff0e3c1cf5af..4aa88cdf92309 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda12.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-cuda12.txt @@ -6,7 +6,7 @@ py3nvml # The version of cuda-python shall be compatible with installed CUDA version. -# For example, if your CUDA version is 12.1, you can install cuda-python 12.1. +# For demo of TensorRT excution provider and TensortRT. cuda-python>=12.1.0 # For windows, cuda-python need the following diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt index 0798b659306b5..de242e77cdb2e 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements.txt @@ -1,8 +1,8 @@ -diffusers==0.24.0 -transformers==4.38.0 +diffusers==0.28.0 +transformers==4.41.2 numpy>=1.24.1 accelerate -onnx==1.14.1 +onnx==1.16.0 coloredlogs packaging # Use newer version of protobuf might cause crash @@ -11,7 +11,7 @@ psutil sympy controlnet_aux==0.0.7 # The following are for SDXL -optimum==1.14.1 +optimum==1.20.0 safetensors invisible_watermark # newer version of opencv-python migth encounter module 'cv2.dnn' has no attribute 'DictValue' error diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py index 9a3615c1cbeca..86477a7e3168b 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/test/check_image.py @@ -64,11 +64,11 @@ def main(): score = round(generate_score(image1, image2, cache_dir), 2) print("similarity Score: ", {score}) if args.negative: - if score > 97: + if score > 95: print("Why generated this incorrect image") raise SystemExit(1) else: - if score < 97: + if score < 95: print(f"{image1} and {image2} are different") raise SystemExit(1) else: diff --git a/onnxruntime/python/tools/transformers/models/whisper/requirements.txt b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt index a722c13e80766..689b14ea9a684 100644 --- a/onnxruntime/python/tools/transformers/models/whisper/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/whisper/requirements.txt @@ -7,7 +7,7 @@ soundfile librosa optimum onnxruntime-extensions>=0.9.0 -onnx==1.16.0 +onnx==1.16.1 protobuf==3.20.2 numpy==1.23.3 psutil diff --git a/onnxruntime/test/onnx/TestCase.cc b/onnxruntime/test/onnx/TestCase.cc index 1d54a3cfae9bf..6d3e9c2cb7865 100644 --- a/onnxruntime/test/onnx/TestCase.cc +++ b/onnxruntime/test/onnx/TestCase.cc @@ -1381,6 +1381,11 @@ std::unique_ptr> GetBrokenTests(const std::string& provider // expected 13.5 (41580000), got 0 (0), diff: 13.5, tol=0.0145 idx=3. 3 of 4 differ broken_tests->insert({"averagepool_2d_ceil", "result differs"}); #endif + // These next 3 Resize tests fail on CPU backend with QNN SDK 2.22.0 due to inaccuracy. + // output=Y:expected 1 (3f800000), got 3 (40400000), diff: 2, tol=0.002 idx=24. 
8 of 56 differ + broken_tests->insert({"resize_upsample_sizes_nearest", "result differs"}); + broken_tests->insert({"resize_upsample_sizes_nearest_axes_2_3", "result differs"}); + broken_tests->insert({"resize_upsample_sizes_nearest_axes_3_2", "result differs"}); } #ifdef DISABLE_CONTRIB_OPS diff --git a/onnxruntime/test/onnx/microbenchmark/eigen.cc b/onnxruntime/test/onnx/microbenchmark/eigen.cc index 29894316edd01..230a57740d448 100644 --- a/onnxruntime/test/onnx/microbenchmark/eigen.cc +++ b/onnxruntime/test/onnx/microbenchmark/eigen.cc @@ -1,3 +1,5 @@ +#include "onnxruntime_config.h" + #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic push #if __GNUC__ >= 6 @@ -6,6 +8,15 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #pragma GCC diagnostic ignored "-Wunused-result" #pragma GCC diagnostic ignored "-Wmaybe-uninitialized" + +// _deps/eigen-src/unsupported/Eigen/CXX11/../../../Eigen/src/Core/arch/NEON/PacketMath.h:1671:9: +// error: ‘void* memcpy(void*, const void*, size_t)’ copying an object of non-trivial type ‘Eigen::internal::Packet4c’ +// {aka ‘struct Eigen::internal::eigen_packet_wrapper’} from an array of ‘const int8_t’ +// {aka ‘const signed char’} [-Werror=class-memaccess] +#ifdef HAS_CLASS_MEMACCESS +#pragma GCC diagnostic ignored "-Wclass-memaccess" +#endif + #elif defined(_MSC_VER) // build\windows\debug\external\eigen3\unsupported\eigen\cxx11\src/Tensor/Tensor.h(76): // warning C4554: '&': check operator precedence for possible error; use parentheses to clarify precedence diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index fd9d222ec8904..eb3575f2cde88 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -484,6 +484,22 @@ TEST(MathOpTest, Add_Invalid_Broadcast) { {}, nullptr, &execution_providers); } +// TEST(MathOpTest, Add_large_dimension) { +// OpTester test("Add"); + +// int64_t num_elem = static_cast(std::numeric_limits::max()) + 1000; +// // int64_t num_elem = static_cast(200) + 1000; +// float input_scalar{4.0f}; +// std::vector input_sequence(num_elem, 0), output_sequence(num_elem, input_scalar); +// test.AddInput("A", {num_elem}, input_sequence); +// test.AddInput("B", {1}, {input_scalar}); +// test.AddOutput("C", {num_elem}, output_sequence); + +// std::vector> execution_providers; +// execution_providers.push_back(DefaultCudaExecutionProvider()); +// test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); +// } + TEST(MathOpTest, Sub_int32) { OpTester test("Sub"); test.AddInput("A", {3}, {1, 4, 3}); diff --git a/onnxruntime/test/providers/cpu/math/matmul_test.cc b/onnxruntime/test/providers/cpu/math/matmul_test.cc index 24340e69c13c2..82f6914d08199 100644 --- a/onnxruntime/test/providers/cpu/math/matmul_test.cc +++ b/onnxruntime/test/providers/cpu/math/matmul_test.cc @@ -163,22 +163,15 @@ void RunMatMulTest(int32_t opset_version, bool is_a_constant, bool is_b_constant // OpenVINO EP: Disabled temporarily matmul broadcasting not fully supported // Disable TensorRT because of unsupported data type - std::unordered_set excluded_providers{kTensorrtExecutionProvider, kOpenVINOExecutionProvider}; + // QNN EP: Crash during graph execution for QNN's CPU backend on QNN SDK 2.22. Not a problem for QNN's HTP backend. 
+ std::unordered_set excluded_providers{kTensorrtExecutionProvider, + kOpenVINOExecutionProvider, + kQnnExecutionProvider}; if (t.name == "test 2D empty input") { // NNAPI: currently fails for the "test 2D empty input" case excluded_providers.insert(kNnapiExecutionProvider); } - if ("test padding and broadcast A > B" == t.name || "test 2D empty input" == t.name) { - // QNN can't handle 0 shap - excluded_providers.insert(kQnnExecutionProvider); - } -#if defined(__linux__) - if (t.name == "test padding and broadcast B > A") { - // Accuracy error with QNN SDK 2.17.0 on CPU backend. - excluded_providers.insert(kQnnExecutionProvider); - } -#endif test.ConfigExcludeEps(excluded_providers) .Config(run_with_tunable_op) .RunWithConfig(); diff --git a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc index 5eeda5a3b8949..054dcfc75b92e 100644 --- a/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/quantize_linear_test.cc @@ -794,5 +794,760 @@ TEST(QuantizeLinearOpMLFloat16Test, Float8) { #endif +namespace blocked_dequantization { + +template +void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(int64_t block_size, + int64_t scale_block_count, + int64_t zero_point_block_count) { + OpTester test("DequantizeLinear", 21); + std::vector dims{2, 4}; + std::vector x_scale, y; + std::vector x, x_zero_point; + SessionOptions so; + std::vector log_msgs; // redirect error messages + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category, + const char* logid, const char* code_location, const char* message) { + ORT_UNUSED_PARAMETER(severity); + ORT_UNUSED_PARAMETER(category); + ORT_UNUSED_PARAMETER(logid); + ORT_UNUSED_PARAMETER(code_location); + std::vector* v_ptr = reinterpret_cast*>(param); + std::vector& msg_vector = *v_ptr; + msg_vector.push_back(std::string(message)); + }; + so.user_logging_param = &log_msgs; + so.session_logid = "DequantizeLinear"; + so.use_per_session_threads = false; + so.session_log_verbosity_level = 1; + so.graph_optimization_level = TransformerLevel::Default; + + for (int64_t i = 0, n = 2 * zero_point_block_count; i < n; ++i) x_zero_point.push_back(0); + for (int64_t i = 0, n = 2 * scale_block_count; i < n; i++) x_scale.push_back(Tout(2.0f)); + for (int i = 0; i < 8; ++i) { + x.push_back(i); + y.push_back(Tout(static_cast(i) * 2.0f)); + } + + test.AddInput("x", dims, x); + test.AddAttribute("axis", 1); + test.AddAttribute("block_size", block_size); + test.AddInput("x_scale", {2, scale_block_count}, x_scale); + test.AddInput("x_zero_point", {2, zero_point_block_count}, x_zero_point); + test.AddOutput("y", dims, y); + test.Run(so, OpTester::ExpectResult::kExpectFailure, "", {}, nullptr, &eps); +} + +template +void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(int64_t block_size, + int64_t scale_block_count, + int64_t zero_point_block_count) { + OpTester test("DequantizeLinear", 21); + std::vector dims{2, 4}; + std::vector x_scale, y; + std::vector x, x_zero_point; + SessionOptions so; + std::vector log_msgs; // redirect error messages + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category, + const char* logid, const char* code_location, const char* message) { + ORT_UNUSED_PARAMETER(severity); + ORT_UNUSED_PARAMETER(category); + 
ORT_UNUSED_PARAMETER(logid); + ORT_UNUSED_PARAMETER(code_location); + std::vector* v_ptr = reinterpret_cast*>(param); + std::vector& msg_vector = *v_ptr; + msg_vector.push_back(std::string(message)); + }; + so.user_logging_param = &log_msgs; + so.session_logid = "DequantizeLinear"; + so.use_per_session_threads = false; + so.session_log_verbosity_level = 1; + so.graph_optimization_level = TransformerLevel::Default; + + for (int64_t i = 0, n = zero_point_block_count; i < n; ++i) x_zero_point.push_back(Tin(0, 0)); + for (int64_t i = 0, n = 2 * scale_block_count; i < n; i++) x_scale.push_back(Tout(2.0f)); + for (int i = 0; i < 8; ++i) { + if (i & 1) x.push_back(Tin(i - 1, i)); + y.push_back(Tout(static_cast(i) * 2.0f)); + } + + test.AddInput("x", dims, x); + test.AddAttribute("axis", 1); + test.AddAttribute("block_size", block_size); + test.AddInput("x_scale", {2, scale_block_count}, x_scale); + test.AddInput("x_zero_point", {2, zero_point_block_count}, x_zero_point); + test.AddOutput("y", dims, y); + test.Run(so, OpTester::ExpectResult::kExpectFailure, "", {}, nullptr, &eps); +} + +template +void DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(int64_t block_size, + int64_t scale_block_count, + int64_t zero_point_block_count) { + OpTester test("DequantizeLinear", 21); + std::vector dims{2, 4}; + std::vector x_scale, y; + std::vector x, x_zero_point; + SessionOptions so; + std::vector log_msgs; // redirect error messages + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + so.user_logging_function = [](void* param, OrtLoggingLevel severity, const char* category, + const char* logid, const char* code_location, const char* message) { + ORT_UNUSED_PARAMETER(severity); + ORT_UNUSED_PARAMETER(category); + ORT_UNUSED_PARAMETER(logid); + ORT_UNUSED_PARAMETER(code_location); + std::vector* v_ptr = reinterpret_cast*>(param); + std::vector& msg_vector = *v_ptr; + msg_vector.push_back(std::string(message)); + }; + so.user_logging_param = &log_msgs; + so.session_logid = "DequantizeLinear"; + so.use_per_session_threads = false; + so.session_log_verbosity_level = 1; + so.graph_optimization_level = TransformerLevel::Default; + + for (int64_t i = 0, n = 2 * zero_point_block_count; i < n; i++) x_zero_point.push_back(Tin(0.0f)); + for (int64_t i = 0, n = 2 * scale_block_count; i < n; i++) x_scale.push_back(Tout(2.0f)); + for (int i = 0; i < 8; ++i) x.push_back(Tin(static_cast(i))); + for (int i = 0; i < 8; ++i) y.push_back(Tout(static_cast(i) * 2.0f)); + + test.AddInput("x", dims, x); + test.AddAttribute("axis", 1); + test.AddAttribute("block_size", block_size); + test.AddInput("x_scale", {2, scale_block_count}, x_scale); + test.AddInput("x_zero_point", {2, zero_point_block_count}, x_zero_point); + test.AddOutput("y", dims, y); + test.Run(so, OpTester::ExpectResult::kExpectFailure, "", {}, nullptr, &eps); +} + +// test negative block size fail +TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Int) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-1, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-1, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-2, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(-2, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-3, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-3, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-4, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-4, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-5, 
2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-5, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-6, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-1, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-1, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(-1, 2, 2); +} + +#if !defined(DISABLE_FLOAT8_TYPES) +TEST(DequantizeLinearOp21BlockedTest, NagativeBlockSize_Float8) { + constexpr int min_cuda_architecture = 11080; + bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); + bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); + + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-1, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-2, 2, 2); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-3, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-4, 2, 2); + } + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-5, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-6, 2, 2); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-1, 2, 2); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(-1, 2, 2); + } +} +#endif + +// test block size incompatible with x_scale shape fail +TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Int) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 1, 1); +} + +#if !defined(DISABLE_FLOAT8_TYPES) +TEST(DequantizeLinearOp21BlockedTest, IncompatibleBlockSizeWithX_Float8) { + constexpr int min_cuda_architecture = 11080; + bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); + bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); + + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 3, 3); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 3, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 1, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 3, 3); + } +} +#endif + +// test x_scale vs. 
x_zero_point shape incompatible fail +TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Int) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int4(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Int(3, 2, 1); +} + +#if !defined(DISABLE_FLOAT8_TYPES) +TEST(DequantizeLinearOp21BlockedTest, ScaleShapeUnmatchZeroPoint_Float8) { + constexpr int min_cuda_architecture = 11080; + bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); + bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); + + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 3); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 3); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 1); + DequantizeLinearOp21BlockedTest_InvalidBlockSize_Float8(3, 2, 3); + } +} +#endif + +// test DQ with blocked quantization succeed +template +void DequantizeLinearOp21BlockedTest_Int4_Succeed(std::vector&& dims, + int64_t axis, + int64_t block_size, + std::vector& x_, + std::vector& x_scale_, + std::vector& x_zero_point_, + std::vector& y_) { + OpTester test("DequantizeLinear", 21); + std::vector x_scale_shape; + std::vector x_scale, y; + std::vector x, x_zero_point; + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + + int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis; + bool use_zero_point = !x_zero_point_.empty(); + + for (auto v : y_) y.push_back(Tout(v)); + for (auto v : x_scale_) x_scale.push_back(Tout(v)); + for (size_t i = 0, n = dims.size(); i < n; ++i) { + x_scale_shape.push_back((int64_t)i == non_neg_axis ? 
(dims[i] + block_size - 1) / block_size : dims[i]); + } + + size_t i = 0, n = x_.size(); + for (; i < n - 1; i += 2) x.push_back(Tin(x_[i], x_[i + 1])); + if (i < n) x.push_back(Tin(x_[i], 0xF)); + + if (use_zero_point) { + i = 0, n = x_zero_point_.size(); + for (; i < n - 1; i += 2) x_zero_point.push_back(Tin(x_zero_point_[i], x_zero_point_[i + 1])); + if (i < n) x_zero_point.push_back(Tin(x_zero_point_[i], 0xF)); + } + + test.AddInput("x", dims, x); + test.AddAttribute("axis", axis); + test.AddAttribute("block_size", block_size); + test.AddInput("x_scale", x_scale_shape, x_scale); + if (use_zero_point) test.AddInput("x_zero_point", x_scale_shape, x_zero_point); + test.AddOutput("y", dims, y); + test.Run(BaseTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps); +} + +template +void DequantizeLinearOp21BlockedTest_Int_Succeed(std::vector&& dims, + int64_t axis, + int64_t block_size, + std::vector& x_, + std::vector& x_scale_, + std::vector& x_zero_point_, + std::vector& y_) { + OpTester test("DequantizeLinear", 21); + std::vector x_scale_shape; + std::vector x_scale, y; + std::vector x, x_zero_point; + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + + int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis; + bool use_zero_point = !x_zero_point_.empty(); + + for (auto v : y_) y.push_back(Tout(v)); + for (auto v : x_scale_) x_scale.push_back(Tout(v)); + for (size_t i = 0, n = dims.size(); i < n; ++i) { + x_scale_shape.push_back((int64_t)i == non_neg_axis ? (dims[i] + block_size - 1) / block_size : dims[i]); + } + for (auto v : x_) x.push_back(v); + if (use_zero_point) + for (auto v : x_zero_point_) x_zero_point.push_back(v); + + test.AddInput("x", dims, x); + test.AddAttribute("axis", axis); + test.AddAttribute("block_size", block_size); + test.AddInput("x_scale", x_scale_shape, x_scale); + if (use_zero_point) test.AddInput("x_zero_point", x_scale_shape, x_zero_point); + test.AddOutput("y", dims, y); + test.Run(BaseTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps); +} + +template +void DequantizeLinearOp21BlockedTest_Float8_Succeed(std::vector&& dims, + int64_t axis, + int64_t block_size, + std::vector& x_, + std::vector& x_scale_, + std::vector& x_zero_point_, + std::vector& y_) { + OpTester test("DequantizeLinear", 21); + std::vector x_scale_shape; + std::vector x_scale, y; + std::vector x, x_zero_point; + std::vector> eps; + eps.push_back(DefaultCpuExecutionProvider()); + + int64_t non_neg_axis = axis < 0 ? axis + dims.size() : axis; + bool use_zero_point = !x_zero_point_.empty(); + + for (auto v : y_) y.push_back(Tout(v)); + for (auto v : x_scale_) x_scale.push_back(Tout(v)); + for (size_t i = 0, n = dims.size(); i < n; ++i) { + x_scale_shape.push_back((int64_t)i == non_neg_axis ? 
(dims[i] + block_size - 1) / block_size : dims[i]); + } + + for (auto v : x_) x.push_back(Tin(static_cast(v))); + if (use_zero_point) { + for (auto v : x_zero_point_) x_zero_point.push_back(Tin(static_cast(v))); + } + + test.AddInput("x", dims, x); + test.AddAttribute("axis", axis); + test.AddAttribute("block_size", block_size); + test.AddInput("x_scale", x_scale_shape, x_scale); + if (use_zero_point) test.AddInput("x_zero_point", x_scale_shape, x_zero_point); + test.AddOutput("y", dims, y); + test.Run(BaseTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &eps); +} + +TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_FirstAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, 2.0, 8.0, -10.5, -4.0, 10.0, 24.0, -24.5, 8.0}; + std::vector y_3{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, -2.0, -8.0, 10.5, 4.0, 10.0, 24.0, -24.5, 8.0}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_FirstAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{-6, -4, -3, -1, 0, 2, 4, 7}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{2.0, 8.0, -7.0, -3, -6.0, -8.0, 7.0, 1, 2.0, 0, 3.5, 3.0, 10.0, 16.0, -10.5, 15}; + std::vector y_3{2.0, 8.0, -7.0, -3, -6.0, -8.0, 7.0, 1, -14.0, -24, 21, 5, 10.0, 16.0, -10.5, 15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, 
x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_MiddleAxis) { + std::vector zero_point{}; + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{14, 24, 10, 16, -10.5, -2, -3.5, 0, 2, 8, 6, 16, -17.5, -6, -24.5, 8}; + std::vector y_3{14, 24, 10, 16, 6, 8, -3.5, 0, 2, 8, 6, 16, 10, 24, -24.5, 8}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_MiddleAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{-6, -4, -3, -1, 0, 2, 4, 7}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{2, 8, -2, 0, 0, -1, 7, 1, 2, 0, 6, 8, -3.5, 1, -10.5, 15}; + std::vector y_3{2, 8, -2, 0, -6, -8, 7, 1, 2, 0, 6, 8, 10, 16, -10.5, 15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, 
x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, SignedInt_NoZeroPoint_LastAxis) { + std::vector zero_point{}; + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{14, 12, 20, 16, -10.5, -7, -1, 0, 2, 4, 12, 16, -17.5, -21, -7, 8}; + std::vector y_3{14, 12, 10, 16, -10.5, -7, -3.5, 0, 2, 4, 6, 16, -17.5, -21, -24.5, 8}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, SignedInt_UseZeroPoint_LastAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{-6, -4, -3, -1, 0, 2, 4, 7}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, -8}; + std::vector y_2{2, 0, 4, 0, 0, 3.5, 0, 1, 2, 4, 4, 8, -3.5, -7, 0, 15}; + std::vector y_3{2, 0, -2, 0, 0, 3.5, 7, 1, 2, 4, 6, 8, -3.5, -7, -10.5, 15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 
3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_FirstAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{0, -4, 7, 3, -8, -20, 21, 7, 16, 36, -35, -11, 24, 52, -49, -15}; + std::vector y_3{0, -4, 7, 3, -8, -20, 21, 7, -16, -36, 35, 11, 24, 52, -49, -15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_FirstAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{4, -4, 3.5, -6, -4, -20, 17.5, -2, -10, 16, 3.5, -5, -2, 32, -10.5, -9}; + std::vector y_3{4, -4, 3.5, -6, -4, -20, 17.5, -2, -12, -36, 31.5, 2, -2, 32, -10.5, -9}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); +} + 
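+// Worked example for the test data above, assuming the standard DequantizeLinear
+// formula y = (x - zero_point) * scale, with scale/zero_point indexed per block of
+// `block_size` elements along `axis`: in UnsignedInt_UseZeroPoint_FirstAxis with
+// block_size = 2, element x[2][0][0] = 8 falls in block 1 along axis 0, so it uses
+// scale = 2.0 and zero_point = 13, giving y = (8 - 13) * 2.0 = -10 (y_2[8]).
+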
+TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_MiddleAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{0, -4, -4, -12, 14, 5, 21, 7, 16, 36, 20, 44, -42, -13, -49, -15}; + std::vector y_3{0, -4, -4, -12, -8, -20, 21, 7, 16, 36, 20, 44, 24, 52, -49, -15}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_MiddleAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{4, -4, 0, -12, 10.5, -4, 17.5, -2, -10, 16, -6, 24, -3.5, -7, -10.5, -9}; + std::vector y_3{4, -4, 0, -12, -4, -20, 17.5, -2, -10, 16, -6, 24, -2, 32, -10.5, -9}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_NoZeroPoint_LastAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{0, -2, -8, -12, 14, 17.5, 6, 7, 16, 18, 40, 44, -42, -45.5, -14, -15}; + std::vector y_3{0, -2, -4, -12, 14, 17.5, 21, 7, 16, 18, 20, 44, -42, -45.5, -49, -15}; + + 
DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); +} + +TEST(DequantizeLinearOp21BlockedTest, UnsignedInt_UseZeroPoint_LastAxis) { + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{2, 0, 1, 9, 13, 5, 11, 6}; + std::vector x{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; + std::vector y_2{4, 2, -8, -12, 10.5, 14, -3, -2, -10, -8, 20, 24, -3.5, -7, -8, -9}; + std::vector y_3{4, 2, 0, -12, 10.5, 14, 17.5, -2, -10, -8, -6, 24, -3.5, -7, -10.5, -9}; + + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int4_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Int_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); +} + +#if !defined(DISABLE_FLOAT8_TYPES) +TEST(DequantizeLinearOp21BlockedTest, Float8_NoZeroPoint_FirstAxis) { + constexpr int min_cuda_architecture = 11080; + bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); + bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); + + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector zero_point{}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8}; + std::vector y_2{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, 2.0, 8.0, -10.5, -4.0, 10.0, 24.0, -24.5, -8.0}; + std::vector y_3{14.0, 24.0, -17.5, -4.0, 6.0, 8.0, -3.5, 0.0, -2.0, -8.0, 10.5, 4.0, 10.0, 24.0, -24.5, -8.0}; + + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 2, x, x_scale, 
zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {4, 2, 2}, 0, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {4, 2, 2}, 0, 3, x, x_scale, zero_point, y_3); + } +} + +TEST(DequantizeLinearOp21BlockedTest, Float8_NoZeroPoint_MiddleAxis) { + constexpr int min_cuda_architecture = 11080; + bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); + bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); + + std::vector zero_point{}; + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8}; + std::vector y_2{14, 24, 10, 16, -10.5, -2, -3.5, 0, 2, 8, 6, 16, -17.5, -6, -24.5, -8}; + std::vector y_3{14, 24, 10, 16, 6, 8, -3.5, 0, 2, 8, 6, 16, 10, 24, -24.5, -8}; + + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 4, 2}, 1, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 4, 2}, 1, 3, x, 
x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 4, 2}, 1, 3, x, x_scale, zero_point, y_3); + } +} + +TEST(DequantizeLinearOp21BlockedTest, Float8_NoZeroPoint_LastAxis) { + constexpr int min_cuda_architecture = 11080; + bool enable_cuda = (nullptr != DefaultCpuExecutionProvider().get()) && HasCudaEnvironment(min_cuda_architecture); + bool enable_cpu = (nullptr != DefaultCpuExecutionProvider().get()); + + std::vector zero_point{}; + std::vector x_scale{-2.0, -4.0, 3.5, 1.0, 2.0, 4.0, -3.5, -1.0}; + std::vector x{-7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8}; + std::vector y_2{14, 12, 20, 16, -10.5, -7, -1, 0, 2, 4, 12, 16, -17.5, -21, -7, -8}; + std::vector y_3{14, 12, 10, 16, -10.5, -7, -3.5, 0, 2, 4, 6, 16, -17.5, -21, -24.5, -8}; + + if (enable_cpu || enable_cuda) { + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + } + if (enable_cpu) { + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 2, 4}, 2, 2, x, x_scale, zero_point, y_2); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed({2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + DequantizeLinearOp21BlockedTest_Float8_Succeed( + {2, 2, 4}, 2, 3, x, x_scale, zero_point, y_3); + } +} +#endif +} // namespace blocked_dequantization + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/tensor/trilu_op_test.cc b/onnxruntime/test/providers/cpu/tensor/trilu_op_test.cc index f0b5d6afa9c7b..f1d1d94343e6f 100644 --- a/onnxruntime/test/providers/cpu/tensor/trilu_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/trilu_op_test.cc @@ -62,63 +62,54 @@ TEST(TriluOpTest, two_by_two_long_lower) { test.Run(); } +TEST(TriluOpTest, two_by_two_bool_upper) { + OpTester test("Trilu", 14, kOnnxDomain); + int64_t up = 1; + test.AddAttribute("upper", up); + test.AddInput("X", {2, 2}, + {true, true, + true, true}); + test.AddOutput("Y", {2, 2}, + {true, true, + false, true}); + test.Run(); +} + +TEST(TriluOpTest, three_by_three_bool_lower) { + OpTester test("Trilu", 14, kOnnxDomain); + int64_t up = 0; + test.AddAttribute("upper", up); + test.AddInput("X", {3, 3}, + // include a couple of false values to check they are copied + {true, true, true, + true, false, true, + true, true, false}); + 
test.AddOutput("Y", {3, 3}, + {true, false, false, + true, false, false, + true, true, false}); + test.Run(); +} + TEST(TriluOpTest, three_dim_float_upper) { OpTester test("Trilu", 14, kOnnxDomain); test.AddInput("X", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.AddInput("k", {1}, {1}); test.AddOutput("Y", {2, 3, 4}, - { - 0.f, - 1.f, - 5.f, - 8.f, - 0.f, - 0.f, - 2.f, - 4.f, - 0.f, - 0.f, - 0.f, - 3.f, - 0.f, - 6.f, - 2.f, - 1.f, - 0.f, - 0.f, - 5.f, - 8.f, - 0.f, - 0.f, - 0.f, - 4.f, - }); + {0.f, 1.f, 5.f, 8.f, + 0.f, 0.f, 2.f, 4.f, + 0.f, 0.f, 0.f, 3.f, + + 0.f, 6.f, 2.f, 1.f, + 0.f, 0.f, 5.f, 8.f, + 0.f, 0.f, 0.f, 4.f}); test.Run(); } @@ -127,60 +118,22 @@ TEST(TriluOpTest, three_dim_float_lower) { int64_t up = 0; test.AddAttribute("upper", up); test.AddInput("X", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.AddInput("k", {1}, {1}); test.AddOutput("Y", {2, 3, 4}, - { - 4.f, - 1.f, - 0.f, - 0.f, - 4.f, - 3.f, - 2.f, - 0.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 0.f, - 0.f, - 4.f, - 1.f, - 5.f, - 0.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 0.f, 0.f, + 4.f, 3.f, 2.f, 0.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 0.f, 0.f, + 4.f, 1.f, 5.f, 0.f, + 4.f, 3.f, 2.f, 4.f}); test.Run(); } @@ -189,60 +142,22 @@ TEST(TriluOpTest, neg_k_float_upper) { int64_t up = 1; test.AddAttribute("upper", up); test.AddInput("X", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.AddInput("k", {1}, {-1}); test.AddOutput("Y", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 0.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 0.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 0.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 0.f, 3.f, 2.f, 4.f}); test.Run(); } @@ -251,120 +166,44 @@ TEST(TriluOpTest, neg_k_float_lower) { int64_t up = 0; test.AddAttribute("upper", up); test.AddInput("X", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.AddInput("k", {1}, {-1}); test.AddOutput("Y", {2, 3, 4}, - { - 0.f, - 0.f, - 0.f, - 0.f, - 4.f, - 0.f, - 0.f, - 0.f, - 6.f, - 1.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 4.f, - 0.f, - 0.f, - 0.f, - 4.f, - 3.f, - 0.f, - 0.f, - }); + {0.f, 0.f, 0.f, 0.f, + 4.f, 0.f, 0.f, 0.f, + 6.f, 1.f, 0.f, 0.f, + + 0.f, 0.f, 0.f, 0.f, + 4.f, 0.f, 0.f, 0.f, + 4.f, 3.f, 0.f, 0.f}); test.Run(); } TEST(TriluTest, small_k_float_upper) { OpTester test("Trilu", 14, kOnnxDomain); 
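  // Note: with Trilu's default upper = 1 and k = -5, every element of a 3x4 slice lies
  // on or above the k-th diagonal, so the expected output below equals the input.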
test.AddInput("X", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.AddInput("k", {1}, {-5}); test.AddOutput("Y", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.Run(); } @@ -373,60 +212,22 @@ TEST(TriluOpTest, small_k_float_lower) { int64_t up = 0; test.AddAttribute("upper", up); test.AddInput("X", {2, 3, 4}, - { - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - 6.f, - 1.f, - 2.f, - 3.f, - 1.f, - 6.f, - 2.f, - 1.f, - 4.f, - 1.f, - 5.f, - 8.f, - 4.f, - 3.f, - 2.f, - 4.f, - }); + {4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f, + 6.f, 1.f, 2.f, 3.f, + + 1.f, 6.f, 2.f, 1.f, + 4.f, 1.f, 5.f, 8.f, + 4.f, 3.f, 2.f, 4.f}); test.AddInput("k", {1}, {-5}); test.AddOutput("Y", {2, 3, 4}, - { - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - 0.f, - }); + {0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f, + 0.f, 0.f, 0.f, 0.f}); test.Run(); } diff --git a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc index 023a6078ff94d..036c5760ed560 100644 --- a/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc +++ b/onnxruntime/test/providers/qnn/batch_norm_htp_test.cc @@ -158,7 +158,8 @@ GetTestQDQModelFn BuildQDQBatchNormTestCase(const TestInputDef& input_def, const TestInputDef& scale_def, const TestInputDef& bias_def, - ExpectedEPNodeAssignment expected_ep_assignment) { + ExpectedEPNodeAssignment expected_ep_assignment, + QDQTolerance tolerance = QDQTolerance()) { ProviderOptions provider_options; #if defined(_WIN32) provider_options["backend_path"] = "QnnHtp.dll"; @@ -171,7 +172,8 @@ static void RunBatchNormQDQTest(const TestInputDef& input_def, BuildQDQBatchNormTestCase(input_def, scale_def, bias_def), provider_options, 11, - expected_ep_assignment); + expected_ep_assignment, + tolerance); } static void RunBatchNormFP16Test(const TestInputDef& input_def, @@ -219,7 +221,9 @@ TEST_F(QnnHTPBackendTests, BatchNorm2D) { RunBatchNormQDQTest(TestInputDef({2, num_channels, 2, 2}, false, input_data), // Input data TestInputDef({num_channels}, true, {1.0f, 2.0f}), // Scale initializer TestInputDef({num_channels}, true, {1.1f, 2.1f}), // Bias initializer - ExpectedEPNodeAssignment::All); + ExpectedEPNodeAssignment::All, + // Require a slightly increased tolerance on Windows ARM64 (from 0.4% to 0.6%). + QDQTolerance(0.006f)); } // Test FP16 BatchNormalization on the HTP backend. 
diff --git a/onnxruntime/test/providers/qnn/conv_test.cc b/onnxruntime/test/providers/qnn/conv_test.cc index a469cccbbd447..b88578a915204 100644 --- a/onnxruntime/test/providers/qnn/conv_test.cc +++ b/onnxruntime/test/providers/qnn/conv_test.cc @@ -1626,8 +1626,8 @@ TEST_F(QnnHTPBackendTests, ConvU8U8S32_large_input1_padding_bias_initializer) { ExpectedEPNodeAssignment::All, false, // use_qdq_contrib_ops 13, // opset - // Need tolerance of 0.73% of output range after QNN SDK 2.17 - QDQTolerance(0.00730f)); + // Need tolerance of 0.76% of output range after QNN SDK 2.19.2 + QDQTolerance(0.0076f)); } TEST_F(QnnHTPBackendTests, ConvU8U8S32_large_input2_bias_initializer) { diff --git a/onnxruntime/test/providers/qnn/gemm_op_test.cc b/onnxruntime/test/providers/qnn/gemm_op_test.cc index 959d637753623..33c868694c9c0 100644 --- a/onnxruntime/test/providers/qnn/gemm_op_test.cc +++ b/onnxruntime/test/providers/qnn/gemm_op_test.cc @@ -285,7 +285,8 @@ TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicInputs) { ExpectedEPNodeAssignment::All, 13, false, - QDQTolerance(0.00410f)); + // Require tolerance of 0.74% on Windows ARM64. + QDQTolerance(0.0074f)); } TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicA_StaticB_DynamicC) { @@ -304,7 +305,8 @@ TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicA_StaticB_DynamicC) { ExpectedEPNodeAssignment::All, 13, false, - QDQTolerance(0.00410f)); + // Require tolerance of 0.74% on Windows ARM64. + QDQTolerance(0.0074f)); } TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicA_StaticB_StaticC) { @@ -323,7 +325,8 @@ TEST_F(QnnHTPBackendTests, Gemm_Broadcast_Bias_DynamicA_StaticB_StaticC) { ExpectedEPNodeAssignment::All, 13, false, - QDQTolerance(0.00410f)); + // Require tolerance of 0.74% on Windows ARM64. + QDQTolerance(0.0074f)); } // Test 16-bit QDQ Gemm with dynamic inputs A and Bias. The B input is an initializer. diff --git a/onnxruntime/test/providers/qnn/layer_norm_test.cc b/onnxruntime/test/providers/qnn/layer_norm_test.cc index 8cebdd813dacd..7d129dceca582 100644 --- a/onnxruntime/test/providers/qnn/layer_norm_test.cc +++ b/onnxruntime/test/providers/qnn/layer_norm_test.cc @@ -158,7 +158,20 @@ TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_AU16_WU8) { } // Test accuracy of 8-bit QDQ LayerNorm with a dynamic scale input. -TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_DynamicScale) { +// +// TODO(adrianlizarraga): Fails to finalize with QNN SDK 2.22. +// Verbose logs: +// Starting stage: Graph Transformations and Optimizations +// C:\...\QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:203:ERROR:could not create op: q::flat_to_vtcm +// C:\...\QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:1187:ERROR:Op 0x102800000013 preparation failed with err:-1 +// Completed stage: Graph Transformations and Optimizations (6247 us) +// QnnDsp "node_token_15" generated: could not create op +// QnnDsp RouterWindows graph prepare failed 12 +// QnnDsp Failed to finalize graph (id: 1) with err 1002 +// QnnDsp Wake up free backend 1 thread(s) +// QnnDsp QnnGraph_finalize done. status 0x3ea +// Failed to finalize QNN graph. 
+TEST_F(QnnHTPBackendTests, DISABLED_LayerNorm1D_LastAxis_DynamicScale) { RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), TestInputDef({3}, false, GetFloatDataInRange(0.0f, 1.0f, 3)), // Dynamic {utils::MakeAttribute("axis", static_cast(-1))}, // Last axis diff --git a/onnxruntime/test/providers/qnn/lrn_op_test.cc b/onnxruntime/test/providers/qnn/lrn_op_test.cc index 751db5049f6b9..a99cba66bf167 100644 --- a/onnxruntime/test/providers/qnn/lrn_op_test.cc +++ b/onnxruntime/test/providers/qnn/lrn_op_test.cc @@ -135,8 +135,8 @@ TEST_F(QnnHTPBackendTests, LRNSize3) { 0.75f, // beta 1.0f, // bias 13, // opset - // Need to use tolerance of 0.405% of output range after QNN SDK 2.17 - QDQTolerance(0.00405f)); + // Need to use tolerance of 0.8% of output range after QNN SDK 2.22 + QDQTolerance(0.008f)); } TEST_F(QnnHTPBackendTests, LRNSize5) { @@ -147,8 +147,8 @@ TEST_F(QnnHTPBackendTests, LRNSize5) { 0.75f, // beta 1.0f, // bias 13, // opset - // Need to use tolerance of 0.407% of output range after QNN SDK 2.17 - QDQTolerance(0.00407f)); + // Need to use tolerance of 0.8% of output range after QNN SDK 2.22 + QDQTolerance(0.008f)); } TEST_F(QnnHTPBackendTests, LRN_size_larger_than_channel) { diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp index f26af7c79fdd9..dba60b1041696 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cpp +++ b/onnxruntime/test/providers/qnn/matmul_test.cpp @@ -103,7 +103,8 @@ static void RunQDQMatMulOpOpTest(const TestInputDef& input1_def, // CPU tests: // -TEST_F(QnnCPUBackendTests, MatMulOp) { +// TODO: Crashes during QNN CPU execution (QNN SDK 2.22) +TEST_F(QnnCPUBackendTests, DISABLED_MatMulOp) { RunMatMulOpOpTest(TestInputDef({2, 3}, false, {-10.0f, -4.0f, -2.0f, 0.0f, 5.0f, 10.0f}), TestInputDef({3, 2}, false, {-10.0f, -6.0f, -1.0f, 0.0f, 3.0f, 10.0f}), ExpectedEPNodeAssignment::All, 18); @@ -126,13 +127,8 @@ TEST_F(QnnCPUBackendTests, DISABLED_MatMulOp_Broadcast) { ExpectedEPNodeAssignment::All, 18, 0.0004f); } -#if defined(__linux__) +// TODO: Crashes during QNN CPU execution (QNN SDK 2.22) TEST_F(QnnCPUBackendTests, DISABLED_MatMulOp_PaddingAndBroadcast_BLargerThanA) { -#else -// TODO: When fixed, enable MathOpTest.MatMulFloatType from cpu/mat/matmul_test.cc -// QNN SDK 2.17: Accuracy errors -TEST_F(QnnCPUBackendTests, MatMulOp_PaddingAndBroadcast_BLargerThanA) { -#endif std::vector input0_shape = {2, 3, 2}; std::vector input1_shape = {3, 2, 2, 1}; RunMatMulOpOpTest(TestInputDef(input0_shape, false, GetSequentialFloatData(input0_shape)), diff --git a/onnxruntime/test/python/quantization/test_quant_util.py b/onnxruntime/test/python/quantization/test_quant_util.py index 848857ceb279d..7b3fc08982ac1 100644 --- a/onnxruntime/test/python/quantization/test_quant_util.py +++ b/onnxruntime/test/python/quantization/test_quant_util.py @@ -13,7 +13,13 @@ import onnx from onnx import TensorProto, helper, numpy_helper -from onnxruntime.quantization.quant_utils import compute_scale_zp, load_model_with_shape_infer, model_has_infer_metadata +from onnxruntime.quantization.quant_utils import ( + compute_scale_zp, + load_model_with_shape_infer, + model_has_infer_metadata, + pack_bytes_to_4bit, + quantize_data, +) class TestQuantUtil(unittest.TestCase): @@ -101,6 +107,67 @@ def test_load_external_model(self): model_reloaded = load_model_with_shape_infer(Path(model_file_path)) self.assertTrue(model_has_infer_metadata(model_reloaded)) + def test_pack_bytes_to_4bit(self): + """ + 
Tests the pack_bytes_to_4bit() utility. + """ + subtest_configs = [ + (-8, 6, True), # Odd num elems, signed + (-8, 7, True), # Even num elems, signed + (0, 14, False), # Odd num elems, unsigned + (0, 15, False), # Even num elems, unsigned + ] + for min_val, max_val, signed in subtest_configs: + with self.subTest(min_val=min_val, max_val=max_val, signed=signed): + src_float = numpy.arange(min_val, max_val + 1).astype(numpy.float32) + src_int = src_float.astype(numpy.int8 if signed else numpy.uint8) + + actual_packed_vals = bytes(pack_bytes_to_4bit(src_int.tobytes())) + expected_packed_vals = onnx.helper.pack_float32_to_4bit(src_float, signed).tobytes() + self.assertEqual(actual_packed_vals, expected_packed_vals) + + def test_quantize_data_4bit(self): + """ + Test that calling quantize_data for int4 quantization returns data of the correct type and range. + """ + data_float = numpy.arange(-20, 17).astype(numpy.float32) + + subtest_configs = [ + (onnx.TensorProto.INT4, True), # int4, symmetric quant + (onnx.TensorProto.INT4, False), # int4, asymmetric quant + (onnx.TensorProto.UINT4, True), # uint4, symmetric quant + (onnx.TensorProto.UINT4, False), # uint4, asymmetric quant + ] + + for onnx_type, symmetric in subtest_configs: + with self.subTest(onnx_type=onnx_type, symmetric=symmetric): + _, _, zero_point, scale, data_quant = quantize_data(data_float, onnx_type, symmetric) + is_signed = onnx_type == onnx.TensorProto.INT4 + np_int_type = numpy.int8 if is_signed else numpy.uint8 + qmin = numpy.array(-8 if is_signed else 0, dtype=np_int_type) + qmax = numpy.array(7 if is_signed else 15, dtype=np_int_type) + + self.assertEqual(zero_point.dtype, np_int_type) + self.assertEqual(scale.dtype, data_float.dtype) + + expected_zp, expected_scale = compute_scale_zp( + data_float.min(), data_float.max(), qmin, qmax, symmetric=symmetric + ) + self.assertEqual(zero_point, expected_zp) + self.assertEqual(scale, expected_scale) + + # Even int4 quantization generates 8-bit numpy values.
+ self.assertEqual(data_quant.dtype, np_int_type) + for index, actual_quant_val in enumerate(data_quant.flatten()): + self.assertTrue(actual_quant_val >= qmin and actual_quant_val <= qmax) + + expected_quant_val = numpy.asarray((data_float[index] / scale).round() + zero_point).astype( + np_int_type + ) + numpy.clip(expected_quant_val, qmin, qmax, out=expected_quant_val) + + self.assertEqual(numpy.array(actual_quant_val), expected_quant_val) + if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/python/requirements.txt b/onnxruntime/test/python/requirements.txt index 5d8e356d0fc07..741c411ce55a0 100644 --- a/onnxruntime/test/python/requirements.txt +++ b/onnxruntime/test/python/requirements.txt @@ -1,2 +1,2 @@ -onnx==1.16.0 +onnx==1.16.1 pytest diff --git a/onnxruntime/test/python/transformers/test_gqa_cpu.py b/onnxruntime/test/python/transformers/test_gqa_cpu.py index 4df1ac1cc2b7e..b6b8aee15852f 100644 --- a/onnxruntime/test/python/transformers/test_gqa_cpu.py +++ b/onnxruntime/test/python/transformers/test_gqa_cpu.py @@ -1775,6 +1775,7 @@ def test_gqa_no_past(self): (2000, 2000), (200, 200), (240, 240), + (8000, 8000), ] ) num_h = [(32, 8), (9, 3), (4, 4)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] diff --git a/tools/ci_build/github/apple/build_apple_framework.py b/tools/ci_build/github/apple/build_apple_framework.py index e17bcd65d8814..3cd7a3af70622 100644 --- a/tools/ci_build/github/apple/build_apple_framework.py +++ b/tools/ci_build/github/apple/build_apple_framework.py @@ -187,7 +187,7 @@ def parse_args(): os.path.basename(__file__), description="""Create iOS framework and podspec for one or more osx_archs (xcframework) and building properties specified in the given build config file, see - tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json for details. + tools/ci_build/github/apple/default_full_apple_framework_build_settings.json for details. The output of the final xcframework and podspec can be found under [build_dir]/framework_out. Please note, this building script will only work on macOS. 
""", diff --git a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json index 04a73ae450e5f..84d7e355ed5b4 100644 --- a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json @@ -28,11 +28,11 @@ ], "iphoneos": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ], "iphonesimulator": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ] } } diff --git a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json index 4bc978956d7fc..e2d8f70c02cf3 100644 --- a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json @@ -24,13 +24,13 @@ "--ios", "--use_xcode", "--use_xnnpack", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ], "iphonesimulator": [ "--ios", "--use_xcode", "--use_xnnpack", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ], "macabi":[ "--macos=Catalyst", diff --git a/tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json deleted file mode 100644 index 2bdf8de24f53c..0000000000000 --- a/tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "build_osx_archs": { - "iphoneos": [ - "arm64" - ], - "iphonesimulator": [ - "arm64", - "x86_64" - ] - }, - "build_params": { - "base": [ - "--parallel", - "--use_xcode", - "--build_apple_framework", - "--minimal_build=extended", - "--disable_rtti", - "--disable_ml_ops", - "--disable_exceptions", - "--enable_reduced_operator_type_support", - "--use_coreml", - "--skip_tests", - "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF" - ], - "iphoneos": [ - "--ios", - "--apple_deploy_target=12.0" - ], - "iphonesimulator": [ - "--ios", - "--apple_deploy_target=12.0" - ] - } -} diff --git a/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json index 2066af7843e0a..1d4a8c038c07b 100644 --- a/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json @@ -25,11 +25,11 @@ ], "iphoneos": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ], "iphonesimulator": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ], "macosx": [ "--macos=MacOSX", diff --git a/tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json b/tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json index 1a89d941e5e52..8f283173f1c6a 100644 --- a/tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json +++ b/tools/ci_build/github/apple/test_minimal_training_ios_simulator_framework_build_settings.json @@ -16,7 +16,7 @@ ], "iphonesimulator": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ] } } diff --git a/tools/ci_build/github/apple/use_ios_pods_with_custom_build.md b/tools/ci_build/github/apple/use_ios_pods_with_custom_build.md index c8da2eff57c33..9e5215a2dc25a 100644 --- 
a/tools/ci_build/github/apple/use_ios_pods_with_custom_build.md +++ b/tools/ci_build/github/apple/use_ios_pods_with_custom_build.md @@ -18,7 +18,7 @@ Run the script: python3 tools/ci_build/github/apple/build_and_assemble_apple_pods.py \ --staging-dir /path/to/staging/dir \ --include-ops-by-config /path/to/custom.config \ - --build-settings-file tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json + --build-settings-file tools/ci_build/github/apple/default_full_apple_framework_build_settings.json ``` This will do a custom build and create the pod package files for it in `/path/to/staging/dir`. diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index f488398293b7f..1703490992fb4 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -31,7 +31,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 67b56095962ab..f7500e0d805e2 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -49,7 +49,7 @@ resources: variables: - template: templates/common-variables.yml - name: docker_base_image - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 - name: linux_trt_version value: 10.0.1.6-1.cuda11.8 - name: Repository @@ -172,6 +172,7 @@ stages: CLIP_MODEL_CACHE: $(Agent.TempDirectory)/clip_cache STABLE_DIFFUSION_MODEL_CACHE: $(Agent.TempDirectory)/stablediffusion_cache GenerateImage_DIR: $(Agent.TempDirectory)/images + hitAnother: 'False' workspace: clean: all pool: onnxruntime-Linux-GPU-A10-12G @@ -243,7 +244,7 @@ stages: -v $(CLIP_MODEL_CACHE):/model_cache:rw \ nvcr.io/nvidia/pytorch:22.11-py3 \ bash -c ' - set -ex; \ + set -x; \ python3 --version; \ python3 -m pip install --upgrade pip; \ pushd /workspace/onnxruntime/python/tools/transformers/models/stable_diffusion/; \ @@ -252,13 +253,19 @@ stages: python3 -m pip install -r requirements.txt; \ echo check demo_txt2image.py generate image; \ python3 -u check_image.py --image1 astronaut_riding_error.png --image2 $image2 --cache_dir /model_cache --negative; \ + if [ $? 
-ne 0 ]; then echo "Hit an unexpected image"; exit 1; fi; \ popd ; \ popd ; \ - ' + ' || ( echo "##vso[task.setvariable variable=hitAnother;]True" && exit 1 ) displayName: 'Check if the generated image is wierd' workingDirectory: $(Build.SourcesDirectory) + # If the generated image hits another test image, mark the job status as a warning continueOnError: true + - bash: | + echo "You can use variables: $(hitAnother)" + + # This step will execute if the generated image doesn't hit another test image - script: | docker run --rm --gpus all -v $PWD:/workspace \ -v $(CLIP_MODEL_CACHE):/model_cache:rw \ @@ -278,6 +285,7 @@ stages: ' displayName: 'Check the generated image' workingDirectory: $(Build.SourcesDirectory) + condition: ne(variables.hitAnother, 'True') - stage: Llama2_ONNX_FP16 dependsOn: diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 3dce851d0e2cd..1dd0b3a5b2b97 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -71,7 +71,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 resources: repositories: @@ -743,4 +743,4 @@ stages: displayName: 'Publish Pipeline NuGet Artifact' inputs: artifactName: 'drop-signed-nuget-qnn' - targetPath: '$(Build.ArtifactStagingDirectory)/nuget-artifact-merged' \ No newline at end of file + targetPath: '$(Build.ArtifactStagingDirectory)/nuget-artifact-merged' diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 6c512666803ba..48a0b7d6c23b7 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -48,9 +48,9 @@ parameters: variables: - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240531.1 - name: Repository ${{ if eq(parameters.CudaVersion, '11.8') }}: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index c7f6c41c8dcc0..133af76357543 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -38,9 +38,9 @@ parameters: variables: - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240530.3 + value:
onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240531.1 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: 10.0.1.6-1.cuda11.8 diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index 5fb3107ce5de7..a1339652a9495 100644 --- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -32,11 +32,11 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 jobs: - job: Build_QNN_EP - pool: onnxruntime-qnn-ubuntu-2004-cpu + pool: onnxruntime-qnn-ubuntu-2204-cpu timeoutInMinutes: 60 workspace: clean: all diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml index 255531681b039..0a19312790a98 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml @@ -58,7 +58,7 @@ jobs: --ios \ --apple_sysroot iphonesimulator \ --osx_arch x86_64 \ - --apple_deploy_target 12.0 \ + --apple_deploy_target=13.0 \ --use_xcode \ --config RelWithDebInfo \ --build_apple_framework \ diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml index 881023e1c1186..c209e20adc131 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml @@ -89,10 +89,6 @@ stages: displayName: "Set common variables" name: SetCommonVariables -- template: templates/stages/mac-ios-packaging-build-stage.yml - parameters: - packageVariant: Mobile - - template: templates/stages/mac-ios-packaging-build-stage.yml parameters: packageVariant: Full diff --git a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml index 38dc53cb5daf2..6901dcb7b68df 100644 --- a/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/post-merge-jobs.yml @@ -445,14 +445,14 @@ stages: python tools/ci_build/github/apple/build_apple_framework.py \ --build_dir "$(Build.BinariesDirectory)/ios_framework" \ --build_dynamic_framework \ - tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json + tools/ci_build/github/apple/default_full_apple_framework_build_settings.json displayName: "Build iOS dynamic framework" - script: | python tools/ci_build/github/apple/test_apple_packages.py \ --framework_info_file "$(Build.BinariesDirectory)/ios_framework/xcframework_info.json" \ --c_framework_dir "$(Build.BinariesDirectory)/ios_framework/framework_out" \ - --variant Mobile + --variant Full displayName: "Test pod with iOS framework" - stage: IosMinimalTrainingBuild diff --git a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml index 3459ba6e48b23..63e70fa8e6488 100644 --- a/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-package-test-pipeline.yml @@ -54,7 +54,7 @@ stages: machine_pool: 'Onnxruntime-Linux-GPU' python_wheel_suffix: '_gpu' timeout: 480 - docker_base_image: 
onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 trt_version: '10.0.1.6-1.cuda11.8' cuda_version: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml index 1273194753ce2..c1fde9eff69b0 100644 --- a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml @@ -59,7 +59,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' - default: 2.21.0.240401 + default: 2.22.0.240425 trigger: none diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index 22169ea5463f5..e27a3bcda16c3 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -2,7 +2,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 - name: build_config displayName: Build Configuration diff --git a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml index e4483b736c3e5..1d5b810dfe726 100644 --- a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml @@ -10,10 +10,8 @@ stages: - stage: Jar_Packaging_GPU dependsOn: - Linux_C_API_Packaging_GPU -# Because Java Jar is published only after Windows Packaging GPU Testing stage we need to depend on the Testing stages -# TODO: change Windows_Packaging_*_Testing to Windows_Packaging_* once we finish PRODUCT BACKLOG ITEM 34666 - - Windows_Packaging_CUDA_Testing - - Windows_Packaging_TensorRT_Testing + - Windows_Packaging_CUDA + - Windows_Packaging_TensorRT - Download_Java_Tools jobs: - job: Jar_Packaging_GPU @@ -141,9 +139,9 @@ stages: value: false - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240530.3 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240531.1 timeoutInMinutes: 60 steps: diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml index 7007c7636da6a..cca53e36ebab9 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ -26,20 +26,14 @@ stages: value: '12' - name: CUDA_VERSION value: ${{ parameters.CudaVersion }} - - name: docker_base_image - ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: 
nvidia/cuda:12.2.2-cudnn8-devel-ubi8 steps: - template: ../templates/set-version-number-variables-step.yml - template: ../templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/Dockerfile - Context: tools/ci_build/github/linux/docker/inference/x86_64/default/gpu + Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/default/cuda${{ variables.CUDA_VERSION_MAJOR }}/Dockerfile + Context: tools/ci_build/github/linux/docker/inference/x86_64/default/cuda${{ variables.CUDA_VERSION_MAJOR }} DockerBuildArgs: " --build-arg BUILD_UID=$( id -u ) - --build-arg BASEIMAGE=${{ parameters.docker_base_image }} " Repository: onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}build @@ -89,21 +83,15 @@ stages: value: 10.0.1.6-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: value: 10.0.1.6-1.cuda12.4 - - name: docker_base_image - ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 steps: - checkout: self clean: true submodules: recursive - template: ../templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/Dockerfile - Context: tools/ci_build/github/linux/docker/inference/x86_64/default/gpu + Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/default/cuda${{ variables.CUDA_VERSION_MAJOR }}/Dockerfile + Context: tools/ci_build/github/linux/docker/inference/x86_64/default/cuda${{ variables.CUDA_VERSION_MAJOR }} DockerBuildArgs: " - --build-arg BASEIMAGE=${{ variables.docker_base_image }} --build-arg TRT_VERSION=${{ variables.linux_trt_version }} --build-arg BUILD_UID=$( id -u ) " @@ -164,11 +152,6 @@ stages: value: 10.0.1.6-1.cuda11.8 ${{ if eq(parameters.CudaVersion, '12.2') }}: value: 10.0.1.6-1.cuda12.4 - - name: docker_base_image - ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 steps: - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime submodules: false @@ -182,8 +165,8 @@ stages: - template: ../templates/get-docker-image-steps.yml parameters: ScriptName: $(Build.SourcesDirectory)/onnxruntime/tools/ci_build/get_docker_image.py - Dockerfile: $(Build.SourcesDirectory)/onnxruntime/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/Dockerfile - Context: $(Build.SourcesDirectory)/onnxruntime/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu + Dockerfile: $(Build.SourcesDirectory)/onnxruntime/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda${{ variables.CUDA_VERSION_MAJOR }}/Dockerfile + Context: $(Build.SourcesDirectory)/onnxruntime/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda${{ variables.CUDA_VERSION_MAJOR }} DockerBuildArgs: "--build-arg BASEIMAGE=${{ variables.docker_base_image }} --build-arg TRT_VERSION=${{ variables.linux_trt_version }} --build-arg BUILD_UID=$( id -u )" Repository: onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}xtrt86build UpdateDepsTxt: false diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml index 45b124b60ab23..9c5282af47c5a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml +++ 
b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml @@ -78,8 +78,8 @@ stages: cmake_build_type: ${{ parameters.cmake_build_type }} cuda_version: ${{ parameters.cuda_version }} ${{ if eq(parameters.cuda_version, '11.8') }}: - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 trt_version: 10.0.1.6-1.cuda11.8 ${{ if eq(parameters.cuda_version, '12.2') }}: - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240530.3 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240531.1 trt_version: 10.0.1.6-1.cuda12.4 diff --git a/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml b/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml index 7945de295f92c..62785b6413e6a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/component-governance-component-detection-steps.yml @@ -29,6 +29,7 @@ steps: ignoreDirectories: '$(Build.Repository.LocalPath)/cmake/external/emsdk/upstream/emscripten/tests, $(Build.Repository.LocalPath)/cmake/external/onnx/third_party, + $(Build.SourcesDirectory)/cmake/external/onnx/third_party, $(Build.Repository.LocalPath)/cmake/external/onnxruntime-extensions, $(Build.Repository.LocalPath)/js/react_native/e2e/node_modules, $(Build.SourcesDirectory)/onnxruntime-inference-examples, diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 9d0a8b42a21ca..e7b230008dad4 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.155 + version: 1.0.156 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.155 + version: 1.0.156 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. 
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml index 232ba23c7bebb..236998407ad16 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.21.0.240401' + default: '2.22.0.240425' steps: - script: | diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml index c6db7bdb449e2..0f43dfc497dff 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.21.0.240401' + default: '2.22.0.240425' steps: - powershell: | diff --git a/tools/ci_build/github/azure-pipelines/templates/make_java_win_binaries.yml b/tools/ci_build/github/azure-pipelines/templates/make_java_win_binaries.yml index 756a7a48343a3..6c77678ce15d8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/make_java_win_binaries.yml +++ b/tools/ci_build/github/azure-pipelines/templates/make_java_win_binaries.yml @@ -3,16 +3,24 @@ parameters: type: string - name: java_artifact_id type: string + - name: buildOnly + type: boolean steps: + - task: CmdLine@2 + displayName: 'Gradle cmakeCheck' + continueOnError: ${{ parameters.buildOnly }} + inputs: + script: | + @echo on + call gradlew.bat cmakeCheck -DcmakeBuildDir=$(Build.BinariesDirectory)\RelWithDebInfo + workingDirectory: $(Build.SourcesDirectory)\java + - task: CmdLine@2 displayName: 'Add symbols and notices to Java' inputs: script: | @echo on - cd $(Build.SourcesDirectory)\java - call $(Build.SourcesDirectory)\java\gradlew.bat cmakeCheck -DcmakeBuildDir=$(Build.BinariesDirectory)\RelWithDebInfo - if %errorlevel% neq 0 exit /b %errorlevel% cd $(Build.BinariesDirectory)\RelWithDebInfo set NATIVE_FOLDER=$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}\stage\ai\onnxruntime\native\win-x64 mkdir %NATIVE_FOLDER% diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 85f05eed27ae1..f2bd0e6f169e9 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml @@ -60,7 +60,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' 
- default: 2.21.0.240401 + default: 2.22.0.240425 stages: - ${{ if eq(parameters.enable_windows_cpu, true) }}: @@ -472,7 +472,7 @@ stages: parameters: arch: 'x86_64' machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} trt_version: '10.0.1.6-1.cuda11.8' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index 4a695e1f3c43d..32fdf4819bd88 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 - name: PYTHON_VERSION type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index dfebf17d95aa2..668e51c828dcd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 - name: ENV_SETUP_SCRIPT type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index e30a3f5ba2d8d..f75bb89b9ad48 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -1,5 +1,5 @@ parameters: - QnnSdk: '2.21.0.240401' + QnnSdk: '2.22.0.240425' build_config: 'RelWithDebInfo' IsReleaseBuild: false DoEsrp: false diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index 2c9f968380a38..a1ae63e606526 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -2,7 +2,6 @@ parameters: - name: packageVariant type: string values: - - Mobile - Full - Training @@ -22,12 +21,6 @@ stages: xcodeVersion: "14.2" ortPodVersion: $[stageDependencies.IosPackaging_SetCommonVariables.j.outputs['SetCommonVariables.ORT_POD_VERSION']] - ${{ if eq(parameters.packageVariant, 'Mobile') }}: - buildSettingsFile: "tools/ci_build/github/apple/default_mobile_ios_framework_build_settings.json" - optionalIncludeOpsByConfigOption: "--include-ops-by-config tools/ci_build/github/android/mobile_package.required_operators.config" - cPodName: onnxruntime-mobile-c - objcPodName: onnxruntime-mobile-objc - ${{ if eq(parameters.packageVariant, 'Full') }}: buildSettingsFile: "tools/ci_build/github/apple/default_full_apple_framework_build_settings.json" cPodName: onnxruntime-c diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 35c23e311ed5d..c726054d8eb10 100644 --- 
a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -191,19 +191,24 @@ stages: createLogFile: true # For CPU job, tests are run in the same machine as building + - ${{ if eq(parameters.buildJava, 'true') }}: + - template: make_java_win_binaries.yml + parameters: + msbuildPlatform: ${{ parameters.msbuildPlatform }} + java_artifact_id: ${{ parameters.java_artifact_id }} + ${{ if contains(parameters.ort_build_pool_name, 'CPU') }}: + buildOnly: false + # When it is a GPU build, we only assemble the java binaries, testing will be done in the later stage with GPU machine + ${{ else }}: + buildOnly: true + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Java temp binaries' + inputs: + pathtoPublish: '$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}' + artifactName: 'drop-onnxruntime-java-win-${{ parameters.packageName }}${{parameters.artifact_name_suffix}}' + # All GPU builds will be tested in the next stage with GPU machine - ${{ if contains(parameters.ort_build_pool_name, 'CPU') }}: - - ${{ if eq(parameters.buildJava, 'true') }}: - - template: make_java_win_binaries.yml - parameters: - msbuildPlatform: ${{ parameters.msbuildPlatform }} - java_artifact_id: ${{ parameters.java_artifact_id }} - - - task: PublishBuildArtifacts@1 - condition: and(succeeded(), eq('${{ parameters.buildJava}}', true)) - displayName: 'Publish Java temp binaries' - inputs: - pathtoPublish: '$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}' - artifactName: 'drop-onnxruntime-java-win-${{ parameters.packageName }}${{parameters.artifact_name_suffix}}' - task: PythonScript@0 displayName: 'test' condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) @@ -386,16 +391,10 @@ stages: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' workingDirectory: '$(Build.BinariesDirectory)' - +# Previous stage only assembles the java binaries, testing will be done in this stage with GPU machine - ${{ if eq(parameters.buildJava, 'true') }}: - template: make_java_win_binaries.yml parameters: msbuildPlatform: ${{ parameters.msbuildPlatform }} java_artifact_id: ${{ parameters.java_artifact_id }} - - - task: PublishBuildArtifacts@1 - condition: and(succeeded(), eq('${{ parameters.buildJava}}', true)) - displayName: 'Publish Java temp binaries' - inputs: - pathtoPublish: '$(Build.BinariesDirectory)\onnxruntime-java-win-${{ parameters.msbuildPlatform }}' - artifactName: 'drop-onnxruntime-java-win-${{ parameters.packageName }}${{parameters.artifact_name_suffix}}' + buildOnly: false \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index a32f2a8a27660..0053a4a64ee02 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 jobs: - job: 'build' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index 
165c01767964f..ede7b3d336768 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.21.0.240401 + default: 2.22.0.240425 jobs: - job: 'build' @@ -90,12 +90,14 @@ jobs: workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)' displayName: 'Run unit tests' - - script: | - .\$(BuildConfig)\onnx_test_runner -j 1 -v -e qnn -i "backend_path|$(QnnSDKRootDir)\lib\x86_64-windows-msvc\QnnCpu.dll" $(Build.SourcesDirectory)\cmake\external\onnx\onnx\backend\test\data\node - workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' - displayName: 'Run ONNX Tests' - - - script: | - .\$(BuildConfig)\onnx_test_runner -j 1 -v -e qnn -i "backend_path|$(QnnSDKRootDir)\lib\x86_64-windows-msvc\QnnCpu.dll" C:\data\float32_models - workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' - displayName: 'Run float32 model tests' + # Comment out QnnCpu tests because QNN SDK 2.22 CPU backend crashes when executing MatMuls. + # Does not happen with HTP backend. + # - script: | + # .\$(BuildConfig)\onnx_test_runner -j 1 -v -e qnn -i "backend_path|$(QnnSDKRootDir)\lib\x86_64-windows-msvc\QnnCpu.dll" $(Build.SourcesDirectory)\cmake\external\onnx\onnx\backend\test\data\node + # workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' + # displayName: 'Run ONNX Tests' + # + # - script: | + # .\$(BuildConfig)\onnx_test_runner -j 1 -v -e qnn -i "backend_path|$(QnnSDKRootDir)\lib\x86_64-windows-msvc\QnnCpu.dll" C:\data\float32_models + # workingDirectory: '$(Build.BinariesDirectory)\$(BuildConfig)' + # displayName: 'Run float32 model tests' diff --git a/tools/ci_build/github/js/react_native_e2e_full_ios_framework_build_settings.json b/tools/ci_build/github/js/react_native_e2e_full_ios_framework_build_settings.json index 78de7edb5ec29..a1266a80d1cd9 100644 --- a/tools/ci_build/github/js/react_native_e2e_full_ios_framework_build_settings.json +++ b/tools/ci_build/github/js/react_native_e2e_full_ios_framework_build_settings.json @@ -14,7 +14,7 @@ ], "iphonesimulator": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ] } } diff --git a/tools/ci_build/github/js/react_native_e2e_mobile_ios_framework_build_settings.json b/tools/ci_build/github/js/react_native_e2e_mobile_ios_framework_build_settings.json index 3d80231393cc6..73ff98f921482 100644 --- a/tools/ci_build/github/js/react_native_e2e_mobile_ios_framework_build_settings.json +++ b/tools/ci_build/github/js/react_native_e2e_mobile_ios_framework_build_settings.json @@ -19,7 +19,7 @@ ], "iphonesimulator": [ "--ios", - "--apple_deploy_target=12.0" + "--apple_deploy_target=13.0" ] } } diff --git a/tools/ci_build/github/linux/docker/Dockerfile.aten_cpu b/tools/ci_build/github/linux/docker/Dockerfile.aten_cpu index 89a0a3c70eaa1..ad1db6a0305ec 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.aten_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.aten_cpu @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240530.3 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240531.1 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps_aten.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu 
b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index cb42db3021f80..9bdc62ace4793 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240530.3 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240531.1 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 index 3eb6b506bebe3..ed920ea057393 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda11_8 @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240530.3 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20240531.1 ARG PYTHON_VERSION=3.9 ARG TORCH_VERSION=2.0.0 ARG OPSET_VERSION=17 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 index ee1fbb2d4f042..ba5cb245eb3e4 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_training_cuda12_2 @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240530.3 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12:20240531.1 ARG PYTHON_VERSION=3.9 ARG TORCH_VERSION=2.1.0 diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile index e3addf6e2e3a2..9a74788300ec9 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc12:20240530.3 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc12:20240531.1 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt index 8f56ee18ccd24..cc47718f78a46 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/scripts/requirements.txt @@ -5,7 +5,7 @@ mypy pytest setuptools>=68.2.2 wheel -onnx==1.16.0 +onnx==1.16.1 protobuf==4.21.12 sympy==1.12 flatbuffers diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile similarity index 77% rename from tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/Dockerfile rename to tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile index 0ca9dbe27af9d..051f9cc6a267f 100644 
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile @@ -2,10 +2,10 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 -FROM $BASEIMAGE -ARG TRT_VERSION +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11_dotnet:20240531.1 +ARG TRT_VERSION +RUN rpm -Uvh https://packages.microsoft.com/config/centos/8/packages-microsoft-prod.rpm && dnf install -y msopenjdk-11 #Install TensorRT only if TRT_VERSION is not empty RUN if [ -n "$TRT_VERSION" ]; then \ echo "TRT_VERSION is $TRT_VERSION" && \ @@ -31,13 +31,13 @@ else \ echo "TRT_VERSION is none skipping Tensor RT Installation" ; \ fi -ENV PATH /usr/lib/jvm/msopenjdk-11/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin +ENV PATH /usr/lib/jvm/msopenjdk-11/bin:$PATH ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 - +ENV CUDAHOSTCXX /opt/rh/gcc-toolset-11/root/usr/bin/g++ ADD scripts /tmp/scripts -RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/scripts/install_deps.sh similarity index 89% rename from tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_deps.sh rename to tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/scripts/install_deps.sh index eb6d3315b97ef..3c88c516bee4e 100755 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/scripts/install_deps.sh @@ -39,8 +39,8 @@ mkdir -p /tmp/src cd /tmp/src echo "Installing cmake" -GetFile https://github.com/Kitware/CMake/releases/download/v3.26.3/cmake-3.26.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.26.3-linux-`uname -m`.tar.gz -tar -zxf /tmp/src/cmake-3.26.3-linux-`uname -m`.tar.gz --strip=1 -C /usr +GetFile https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.29.3-linux-`uname -m`.tar.gz +tar -zxf /tmp/src/cmake-3.29.3-linux-`uname -m`.tar.gz --strip=1 -C /usr echo "Installing Ninja" GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile new file mode 100644 index 0000000000000..a86b96b7adf42 --- /dev/null +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile @@ -0,0 +1,48 @@ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
+ +# This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_2_x64_ubi8_gcc12_dotnet:20240531.1 +ARG TRT_VERSION + +#Install TensorRT only if TRT_VERSION is not empty +RUN if [ -n "$TRT_VERSION" ]; then \ + echo "TRT_VERSION is $TRT_VERSION" && \ + dnf -y install \ + libnvinfer10-${TRT_VERSION} \ + libnvinfer-headers-devel-${TRT_VERSION} \ + libnvinfer-devel-${TRT_VERSION} \ + libnvinfer-lean10-${TRT_VERSION} \ + libnvonnxparsers10-${TRT_VERSION} \ + libnvonnxparsers-devel-${TRT_VERSION} \ + libnvinfer-dispatch10-${TRT_VERSION} \ + libnvinfer-plugin10-${TRT_VERSION} \ + libnvinfer-vc-plugin10-${TRT_VERSION} \ + libnvinfer-bin-${TRT_VERSION} \ + libnvinfer-plugin10-${TRT_VERSION} \ + libnvinfer-plugin-devel-${TRT_VERSION} \ + libnvinfer-vc-plugin-devel-${TRT_VERSION} \ + libnvinfer-lean-devel-${TRT_VERSION} \ + libnvinfer-dispatch-devel-${TRT_VERSION} \ + libnvinfer-headers-plugin-devel-${TRT_VERSION} && \ + dnf clean dbcache ; \ +else \ + echo "TRT_VERSION is none skipping Tensor RT Installation" ; \ +fi + + + +ENV LANG=en_US.UTF-8 +ENV LC_ALL=en_US.UTF-8 + +ENV CUDAHOSTCXX /opt/rh/gcc-toolset-12/root/usr/bin/g++ +ADD scripts /tmp/scripts +RUN sed -i 's/enabled\s*=\s*1/enabled = 1\nexclude=dotnet* aspnet* netstandard*/g' /etc/yum.repos.d/ubi.repo && \ + rpm -Uvh https://packages.microsoft.com/config/centos/8/packages-microsoft-prod.rpm && dnf install -y msopenjdk-11 && cd /tmp/scripts && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts +ENV PATH /usr/lib/jvm/msopenjdk-11/bin:$PATH +ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-11 +ARG BUILD_UID=1001 +ARG BUILD_USER=onnxruntimedev +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER +USER $BUILD_USER diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/scripts/install_deps.sh new file mode 100755 index 0000000000000..3c88c516bee4e --- /dev/null +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/scripts/install_deps.sh @@ -0,0 +1,68 @@ +#!/bin/bash +set -e -x + +# Download a file from internet +function GetFile { + local uri=$1 + local path=$2 + local force=${3:-false} + local download_retries=${4:-5} + local retry_wait_time_seconds=${5:-30} + + if [[ -f $path ]]; then + if [[ $force = false ]]; then + echo "File '$path' already exists. Skipping download" + return 0 + else + rm -rf $path + fi + fi + + if [[ -f $uri ]]; then + echo "'$uri' is a file path, copying file to '$path'" + cp $uri $path + return $? + fi + + echo "Downloading $uri" + # Use aria2c if available, otherwise use curl + if command -v aria2c > /dev/null; then + aria2c -q -d $(dirname $path) -o $(basename $path) "$uri" + else + curl "$uri" -sSL --retry $download_retries --retry-delay $retry_wait_time_seconds --create-dirs -o "$path" --fail + fi + + return $? +} +mkdir -p /tmp/src + +cd /tmp/src + +echo "Installing cmake" +GetFile https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-`uname -m`.tar.gz /tmp/src/cmake-3.29.3-linux-`uname -m`.tar.gz +tar -zxf /tmp/src/cmake-3.29.3-linux-`uname -m`.tar.gz --strip=1 -C /usr + +echo "Installing Ninja" +GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz +tar -zxf ninja-linux.tar.gz +pushd ninja-1.10.0 +cmake -Bbuild-cmake -H. 
+cmake --build build-cmake
+mv ./build-cmake/ninja /usr/bin
+popd
+
+echo "Installing Node.js"
+CPU_ARCH=`uname -m`
+if [[ "$CPU_ARCH" = "x86_64" ]]; then
+  NODEJS_ARCH=x64
+elif [[ "$CPU_ARCH" = "aarch64" ]]; then
+  NODEJS_ARCH=arm64
+else
+  NODEJS_ARCH=$CPU_ARCH
+fi
+# The EOL for nodejs v18.17.1 LTS is April 2025
+GetFile https://nodejs.org/dist/v18.17.1/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz
+tar --strip 1 -xf /tmp/src/node-v18.17.1-linux-${NODEJS_ARCH}.tar.gz -C /usr
+
+cd /
+rm -rf /tmp/src
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_centos.sh
deleted file mode 100755
index 9647280da1aea..0000000000000
--- a/tools/ci_build/github/linux/docker/inference/x86_64/default/gpu/scripts/install_centos.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-set -e -x
-if [ ! -f /etc/yum.repos.d/microsoft-prod.repo ]; then
-  os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1)
-  echo "installing for CentOS version : $os_major_version"
-  rpm -Uvh https://packages.microsoft.com/config/centos/$os_major_version/packages-microsoft-prod.rpm
-fi
-dnf install -y python39-devel glibc-langpack-\* glibc-locale-source which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel msopenjdk-11
-locale
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
index 3cec4ed6e9dce..2f568a78a13dc 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240530.3
+FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20240531.1

 ADD scripts /tmp/scripts
 RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt
index 8f56ee18ccd24..cc47718f78a46 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/scripts/requirements.txt
@@ -5,7 +5,7 @@ mypy
 pytest
 setuptools>=68.2.2
 wheel
-onnx==1.16.0
+onnx==1.16.1
 protobuf==4.21.12
 sympy==1.12
 flatbuffers
diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/scripts/requirements.txt b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/scripts/requirements.txt
index 8f56ee18ccd24..cc47718f78a46 100644
--- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/scripts/requirements.txt
+++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cuda/scripts/requirements.txt
@@ -5,7 +5,7 @@ mypy
 pytest
 setuptools>=68.2.2
 wheel
-onnx==1.16.0
+onnx==1.16.1
 protobuf==4.21.12
 sympy==1.12
 flatbuffers
diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
index 80eccb68ebebb..bdae9d72a1a63 100644
--- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt
@@ -5,7 +5,7 @@ mypy
 pytest
 setuptools>=68.2.2
 wheel
-onnx==1.16.0
+onnx==1.16.1
 protobuf==4.21.12
 sympy==1.12
 flatbuffers
diff --git a/tools/ci_build/github/linux/docker/scripts/requirements.txt b/tools/ci_build/github/linux/docker/scripts/requirements.txt
index e20e433cd33c6..3e619ea3dfb56 100644
--- a/tools/ci_build/github/linux/docker/scripts/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/requirements.txt
@@ -6,7 +6,7 @@ mypy
 pytest
 setuptools==69.0.3
 wheel==0.42.0
-onnx==1.16.0
+onnx==1.16.1
 argparse
 sympy==1.12
 flatbuffers