From 0101ce8fa32856cfc62a2402f7a8f5c5938782d7 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:38:02 -0700 Subject: [PATCH 1/6] remove toggle "disable_workgroup_init" --- onnxruntime/core/providers/webgpu/webgpu_context.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index f2414c14f6f9a..7dbccb532dd55 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -387,7 +387,6 @@ std::vector<const char*> WebGpuContext::GetEnabledDeviceToggles() const { constexpr const char* toggles[] = { "skip_validation", // only use "skip_validation" when ValidationMode is set to "Disabled" "disable_robustness", - "disable_workgroup_init", "d3d_disable_ieee_strictness", }; return std::vector<const char*>(ValidationMode() >= ValidationMode::WGPUOnly From 38967062675ef6906824f5317732b5630054220c Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:44:32 -0700 Subject: [PATCH 2/6] set backend type to D3D12 since we always use DXC (win). --- onnxruntime/core/providers/webgpu/webgpu_context.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 7dbccb532dd55..e9ae97369c6c3 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -33,6 +33,9 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info wgpu::RequestAdapterOptions req_adapter_options = {}; wgpu::DawnTogglesDescriptor adapter_toggles_desc = {}; req_adapter_options.nextInChain = &adapter_toggles_desc; +#ifdef WIN32 + req_adapter_options.backendType = wgpu::BackendType::D3D12; +#endif auto enabled_adapter_toggles = GetEnabledAdapterToggles(); adapter_toggles_desc.enabledToggleCount = enabled_adapter_toggles.size();
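For reference, the adapter-request pattern patch 2 relies on looks roughly like this in isolation — a minimal sketch, not the EP's actual code (the function name and toggle list are illustrative; `_WIN32` is used here as the standard compiler-defined macro):

    #include <webgpu/webgpu_cpp.h>

    // Chain a DawnTogglesDescriptor into the adapter options and, on Windows,
    // pin the backend to D3D12 (shaders there are always compiled with DXC).
    void ConfigureAdapterOptions(wgpu::RequestAdapterOptions& options,
                                 wgpu::DawnTogglesDescriptor& toggles,
                                 const char* const* enabled_toggles,
                                 size_t enabled_count) {
      toggles.enabledToggles = enabled_toggles;
      toggles.enabledToggleCount = enabled_count;
      options.nextInChain = &toggles;  // 'toggles' must outlive the adapter request
    #ifdef _WIN32
      options.backendType = wgpu::BackendType::D3D12;
    #endif
    }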
From f02e85a3adefb62649e97c6fadd5851e6fa1ab2c Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:53:41 -0700 Subject: [PATCH 3/6] update build configurations to webgpu EP (#22047) ### Description --------- Co-authored-by: Scott McKay --- .../external/onnxruntime_external_deps.cmake | 82 ++++++++++++++----- cmake/onnxruntime.cmake | 66 +++++++++++++-- cmake/onnxruntime_providers_webgpu.cmake | 12 +-- cmake/patches/dawn/dawn.patch | 66 +++++++++++++++ .../webgpu/webgpu_provider_factory.h | 14 ++++ .../main/java/ai/onnxruntime/OrtProvider.java | 4 +- .../platform/apple/logging/apple_log_sink.mm | 2 - .../webgpu/math/unary_elementwise_ops.cc | 12 +++ .../core/providers/webgpu/shader_variable.h | 3 + .../core/providers/webgpu/tensor/where.cc | 6 +- .../core/providers/webgpu/webgpu_context.cc | 5 ++ .../ios_package_uitest_cpp_api.mm | 23 +++++- .../macos_package_uitest_cpp_api.mm | 24 +++++- .../default_full_aar_build_settings.json | 1 + .../apple/build_and_assemble_apple_pods.py | 2 + ...t_full_apple_framework_build_settings.json | 1 + ...ult_full_ios_framework_build_settings.json | 2 + .../templates/mac-cpu-packing-jobs.yml | 6 +- 18 files changed, 277 insertions(+), 54 deletions(-) create mode 100644 cmake/patches/dawn/dawn.patch create mode 100644 include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 370a2d5c72351..6f54ce1b4face 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -575,10 +575,11 @@ if (onnxruntime_USE_MIMALLOC) onnxruntime_fetchcontent_makeavailable(mimalloc) endif() -#onnxruntime_EXTERNAL_LIBRARIES could contain onnx, onnx_proto,libprotobuf, cuda/cudnn, -# dnnl/mklml, onnxruntime_codegen_tvm, tvm and pthread -# pthread is always at the last -set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${WIL_TARGET} nlohmann_json::nlohmann_json onnx onnx_proto ${PROTOBUF_LIB} re2::re2 Boost::mp11 safeint_interface flatbuffers::flatbuffers ${GSL_TARGET} ${ABSEIL_LIBS} date::date ${ONNXRUNTIME_CLOG_TARGET_NAME}) +set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${WIL_TARGET} nlohmann_json::nlohmann_json + onnx onnx_proto ${PROTOBUF_LIB} re2::re2 Boost::mp11 safeint_interface + flatbuffers::flatbuffers ${GSL_TARGET} ${ABSEIL_LIBS} date::date + ${ONNXRUNTIME_CLOG_TARGET_NAME}) + # The source code of onnx_proto is generated, we must build this lib first before starting to compile the other source code that uses ONNX protobuf types. # The other libs do not have the problem. All the sources are already there. We can compile them in any order. set(onnxruntime_EXTERNAL_DEPENDENCIES onnx_proto flatbuffers::flatbuffers) @@ -638,33 +639,70 @@ if (onnxruntime_USE_WEBGPU) dawn URL ${DEP_URL_dawn} URL_HASH SHA1=${DEP_SHA1_dawn} + PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch ) - set(DAWN_FETCH_DEPENDENCIES ON) - set(DAWN_ENABLE_INSTALL ON) - set(TINT_BUILD_TESTS OFF) - set(DAWN_USE_BUILT_DXC ON) + + # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size + set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE) + set(DAWN_BUILD_SAMPLES OFF CACHE BOOL "" FORCE) + set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE) + set(DAWN_ENABLE_NULL OFF CACHE BOOL "" FORCE) + set(DAWN_FETCH_DEPENDENCIES ON CACHE BOOL "" FORCE) + + # disable things we don't use set(DAWN_DXC_ENABLE_ASSERTS_IN_NDEBUG OFF) + set(DAWN_ENABLE_DESKTOP_GL OFF CACHE BOOL "" FORCE) + set(DAWN_ENABLE_OPENGLES OFF CACHE BOOL "" FORCE) + set(DAWN_SUPPORTS_GLFW_FOR_WINDOWING OFF CACHE BOOL "" FORCE) + set(DAWN_USE_GLFW OFF CACHE BOOL "" FORCE) + set(DAWN_USE_WINDOWS_UI OFF CACHE BOOL "" FORCE) + set(DAWN_USE_X11 OFF CACHE BOOL "" FORCE) + + set(TINT_BUILD_TESTS OFF CACHE BOOL "" FORCE) + set(TINT_BUILD_CMD_TOOLS OFF CACHE BOOL "" FORCE) + set(TINT_BUILD_GLSL_WRITER OFF CACHE BOOL "" FORCE) + set(TINT_BUILD_GLSL_VALIDATOR OFF CACHE BOOL "" FORCE) + set(TINT_BUILD_IR_BINARY OFF CACHE BOOL "" FORCE) + set(TINT_BUILD_SPV_READER OFF CACHE BOOL "" FORCE) # don't need. disabling is a large binary size saving + set(TINT_BUILD_WGSL_WRITER ON CACHE BOOL "" FORCE) # needed to create cache key. runtime error if not enabled. + + # SPIR-V validation shouldn't be required given we're using Tint to create the SPIR-V. + if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + set(DAWN_ENABLE_SPIRV_VALIDATION OFF CACHE BOOL "" FORCE) + endif() -message(STATUS "Finished fetching external dependencies") + if (WIN32) + # building this requires the HLSL writer to be enabled in Tint. TBD if we need either of these to be ON.
+ set(DAWN_USE_BUILT_DXC ON CACHE BOOL "" FORCE) + set(TINT_BUILD_HLSL_WRITER ON CACHE BOOL "" FORCE) -set(onnxruntime_LINK_DIRS ) + # Vulkan may optionally be included in a Windows build. Exclude until we have an explicit use case that requires it. + set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE) + endif() + onnxruntime_fetchcontent_makeavailable(dawn) + + list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_native dawn::dawn_proc) +endif() + +set(onnxruntime_LINK_DIRS) if (onnxruntime_USE_CUDA) - find_package(CUDAToolkit REQUIRED) + find_package(CUDAToolkit REQUIRED) - if(onnxruntime_CUDNN_HOME) - file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME) - set(CUDNN_PATH ${onnxruntime_CUDNN_HOME}) - endif() - include(cuDNN) + if(onnxruntime_CUDNN_HOME) + file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME) + set(CUDNN_PATH ${onnxruntime_CUDNN_HOME}) + endif() + + include(cuDNN) endif() if(onnxruntime_USE_SNPE) - include(external/find_snpe.cmake) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SNPE_NN_LIBS}) + include(external/find_snpe.cmake) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SNPE_NN_LIBS}) endif() -FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR} ORT_BINARY_DIR) -FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} ORT_SOURCE_DIR) +FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR} ORT_BINARY_DIR) +FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} ORT_SOURCE_DIR) + +message(STATUS "Finished fetching external dependencies") diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 9b6acea876f95..b1d797ca16adc 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -89,10 +89,22 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK) # create Info.plist for the framework and podspec for CocoaPods (optional) set(MACOSX_FRAMEWORK_NAME "onnxruntime") set(MACOSX_FRAMEWORK_IDENTIFIER "com.microsoft.onnxruntime") - # Need to include CoreML as a weaklink for CocoaPods package if the EP is enabled + + # Setup weak frameworks for macOS/iOS. 'weak' as the CoreML or WebGPU EPs are optionally enabled. 
if(onnxruntime_USE_COREML) - set(APPLE_WEAK_FRAMEWORK "\\\"CoreML\\\"") + list(APPEND _weak_frameworks "\\\"CoreML\\\"") + endif() + + if(onnxruntime_USE_WEBGPU) + list(APPEND _weak_frameworks "\\\"QuartzCore\\\"") + list(APPEND _weak_frameworks "\\\"IOSurface\\\"") + list(APPEND _weak_frameworks "\\\"Metal\\\"") endif() + + if (_weak_frameworks) + string(JOIN ", " APPLE_WEAK_FRAMEWORK ${_weak_frameworks}) + endif() + set(INFO_PLIST_PATH "${CMAKE_CURRENT_BINARY_DIR}/Info.plist") configure_file(${REPO_ROOT}/cmake/Info.plist.in ${INFO_PLIST_PATH}) configure_file( @@ -364,16 +376,58 @@ if(onnxruntime_BUILD_APPLE_FRAMEWORK) endif() endforeach() + # helper function that recurses to also handle static library dependencies of the ORT external libraries + set(_processed_libs) # keep track of processed libraries to skip any duplicate dependencies + function(add_symlink_for_static_lib_and_dependencies lib) + function(process cur_target) + # de-alias if applicable so a consistent target name is used + get_target_property(alias ${cur_target} ALIASED_TARGET) + if(TARGET ${alias}) + set(cur_target ${alias}) + endif() + + if(${cur_target} IN_LIST _processed_libs OR ${cur_target} IN_LIST lib_and_dependencies) + return() + endif() + + list(APPEND lib_and_dependencies ${cur_target}) + + get_target_property(link_libraries ${cur_target} LINK_LIBRARIES) + foreach(dependency ${link_libraries}) + if(TARGET ${dependency}) + process(${dependency}) + endif() + endforeach() + + set(lib_and_dependencies ${lib_and_dependencies} PARENT_SCOPE) + endfunction() + + set(lib_and_dependencies) + process(${lib}) + + foreach(_target ${lib_and_dependencies}) + get_target_property(type ${_target} TYPE) + if(${type} STREQUAL "STATIC_LIBRARY") + # message(STATUS "Adding symlink for ${_target}") + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink + $<TARGET_FILE:${_target}> ${STATIC_LIB_DIR}/$<TARGET_FILE_NAME:${_target}>) + endif() + endforeach() + + list(APPEND _processed_libs ${lib_and_dependencies}) + set(_processed_libs ${_processed_libs} PARENT_SCOPE) + endfunction() + # for external libraries we create a symlink to the .a file foreach(_LIB ${onnxruntime_EXTERNAL_LIBRARIES}) - if(NOT TARGET ${_LIB}) # if we didn't build from source.
it may not a target + if(NOT TARGET ${_LIB}) # if we didn't build from source it may not be a target continue() endif() + GET_TARGET_PROPERTY(_LIB_TYPE ${_LIB} TYPE) if(_LIB_TYPE STREQUAL "STATIC_LIBRARY") - add_custom_command(TARGET onnxruntime POST_BUILD - COMMAND ${CMAKE_COMMAND} -E create_symlink - $<TARGET_FILE:${_LIB}> ${STATIC_LIB_DIR}/$<TARGET_FILE_NAME:${_LIB}>) + add_symlink_for_static_lib_and_dependencies(${_LIB}) endif() endforeach() diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake index 587c4b2c1ff2c..8d00ab5aa4494 100644 --- a/cmake/onnxruntime_providers_webgpu.cmake +++ b/cmake/onnxruntime_providers_webgpu.cmake @@ -24,14 +24,8 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_providers_webgpu_cc_srcs}) onnxruntime_add_static_library(onnxruntime_providers_webgpu ${onnxruntime_providers_webgpu_cc_srcs}) - onnxruntime_add_include_to_target(onnxruntime_providers_webgpu onnxruntime_common onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface) - target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn) - - # Copy webgpu_dawn.dll to the output directory - add_custom_command( - TARGET onnxruntime_providers_webgpu - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different "$<TARGET_FILE:dawn::webgpu_dawn>" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>" - VERBATIM ) + onnxruntime_add_include_to_target(onnxruntime_providers_webgpu + onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface) + target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native dawn::dawn_proc) set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime") diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch new file mode 100644 index 0000000000000..d696d386452e8 --- /dev/null +++ b/cmake/patches/dawn/dawn.patch @@ -0,0 +1,66 @@ +diff --git a/src/dawn/native/CMakeLists.txt b/src/dawn/native/CMakeLists.txt +index 9c0bd6fa4e..bf8a57aeac 100644 +--- a/src/dawn/native/CMakeLists.txt ++++ b/src/dawn/native/CMakeLists.txt +@@ -857,6 +857,11 @@ if (DAWN_ENABLE_SWIFTSHADER) + target_compile_definitions(dawn_native PRIVATE "DAWN_ENABLE_SWIFTSHADER") + endif() ++ ++if (IOS) ++ target_compile_options(dawn_native_objects PRIVATE -fno-objc-arc) ++ target_compile_options(dawn_native PRIVATE -fno-objc-arc) ++endif() ++ + if (DAWN_BUILD_MONOLITHIC_LIBRARY) + ############################################################################### + # Do the 'complete_lib' build.
+diff --git a/src/dawn/native/Surface_metal.mm b/src/dawn/native/Surface_metal.mm +index ce55acbd43..baa4835362 100644 +--- a/src/dawn/native/Surface_metal.mm ++++ b/src/dawn/native/Surface_metal.mm +@@ -36,7 +36,13 @@ + namespace dawn::native { + + bool InheritsFromCAMetalLayer(void* obj) { +- id object = static_cast<id>(obj); ++ id object = ++#if TARGET_OS_IOS ++ (__bridge id)obj; ++#else ++ static_cast<id>(obj); ++#endif ++ + return [object isKindOfClass:[CAMetalLayer class]]; + } + +diff --git a/src/dawn/native/metal/SharedFenceMTL.mm b/src/dawn/native/metal/SharedFenceMTL.mm +index bde8bfea07..f2f6459e91 100644 +--- a/src/dawn/native/metal/SharedFenceMTL.mm ++++ b/src/dawn/native/metal/SharedFenceMTL.mm +@@ -40,7 +40,13 @@ ResultOrError<Ref<SharedFence>> SharedFence::Create( + DAWN_INVALID_IF(descriptor->sharedEvent == nullptr, "MTLSharedEvent is missing."); + if (@available(macOS 10.14, iOS 12.0, *)) { + return AcquireRef(new SharedFence( +- device, label, static_cast<id<MTLSharedEvent>>(descriptor->sharedEvent))); ++ device, label, ++#if TARGET_OS_IOS ++ (__bridge id<MTLSharedEvent>)(descriptor->sharedEvent) ++#else ++ static_cast<id<MTLSharedEvent>>(descriptor->sharedEvent) ++#endif ++ )); + } else { + return DAWN_INTERNAL_ERROR("MTLSharedEvent not supported."); + } + +diff --git a/src/tint/api/BUILD.cmake b/src/tint/api/BUILD.cmake +index 0037d83276..6372c4ee77 100644 +--- a/src/tint/api/BUILD.cmake ++++ b/src/tint/api/BUILD.cmake +@@ -57,6 +57,7 @@ tint_target_add_dependencies(tint_api lib + tint_lang_wgsl_ast_transform + tint_lang_wgsl_common + tint_lang_wgsl_features ++ tint_lang_wgsl_inspector + tint_lang_wgsl_program + tint_lang_wgsl_sem + tint_lang_wgsl_writer_ir_to_program diff --git a/include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h b/include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h new file mode 100644 index 0000000000000..0b45b847d651f --- /dev/null +++ b/include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// Dummy file to provide a signal in the ONNX Runtime C cocoapod as to whether the WebGPU EP was included in the build. +// If it was, this file will be included in the cocoapod, and a test like this can be used: +// +// #if __has_include(<onnxruntime/webgpu_provider_factory.h>) +// #define WEBGPU_EP_AVAILABLE 1 +// #else +// #define WEBGPU_EP_AVAILABLE 0 +// #endif + +// The WebGPU EP can be enabled via the generic SessionOptionsAppendExecutionProvider method, so no direct usage of +// the provider factory is required.
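Taken together with the session options API, the client-side pattern this dummy header enables looks roughly like the following (a sketch assuming the cocoapod header layout shown above; the option map is left empty):

    // Compile-time availability probe for the WebGPU EP in the C/C++ pod.
    #if __has_include(<onnxruntime/webgpu_provider_factory.h>)
    #define WEBGPU_EP_AVAILABLE 1
    #else
    #define WEBGPU_EP_AVAILABLE 0
    #endif

    // Registration then goes through the generic API; no factory header is needed:
    //   std::unordered_map<std::string, std::string> provider_options;
    //   session_options.AppendExecutionProvider("WebGPU", provider_options);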
diff --git a/java/src/main/java/ai/onnxruntime/OrtProvider.java b/java/src/main/java/ai/onnxruntime/OrtProvider.java index ae9cb9f908629..b06f884896ee8 100644 --- a/java/src/main/java/ai/onnxruntime/OrtProvider.java +++ b/java/src/main/java/ai/onnxruntime/OrtProvider.java @@ -40,7 +40,9 @@ public enum OrtProvider { /** The XNNPACK execution provider. */ XNNPACK("XnnpackExecutionProvider"), /** The Azure remote endpoint execution provider. */ - AZURE("AzureExecutionProvider"); + AZURE("AzureExecutionProvider"), + /** The WebGPU execution provider. */ + WEBGPU("WebGpuExecutionProvider"); private static final Map<String, OrtProvider> valueMap = new HashMap<>(values().length); diff --git a/onnxruntime/core/platform/apple/logging/apple_log_sink.mm b/onnxruntime/core/platform/apple/logging/apple_log_sink.mm index 00e691a8f9fd3..6abbe76a7f151 100644 --- a/onnxruntime/core/platform/apple/logging/apple_log_sink.mm +++ b/onnxruntime/core/platform/apple/logging/apple_log_sink.mm @@ -7,8 +7,6 @@ #include -#include "date/date.h" - namespace onnxruntime { namespace logging { diff --git a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc index 3b43c87fb0c82..9e8117aa34a92 100644 --- a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc @@ -165,7 +165,19 @@ WEBGPU_ELEMENTWISE_KERNEL(Asinh, 9, WebGpuSupportedFloatTypes()) WEBGPU_ELEMENTWISE_IMPL(Acosh, "acosh(a)") WEBGPU_ELEMENTWISE_KERNEL(Acosh, 9, WebGpuSupportedFloatTypes()) +#if __APPLE__ +// Metal returns 0 for values >= 1.0. +// Need custom impl to return +inf for 1.0 (by dividing 1 by 0), and NaN for > 1.0 (by dividing 0 by 0) +WEBGPU_ELEMENTWISE_IMPL(Atanh, + "select(" + " select(x_value_t(1.0), x_value_t(0.0), a > x_value_t(1.0)) / x_value_t(0.0)," + " atanh(a)," + " a < x_value_t(1.0))", + "", + ShaderUsage::UseValueTypeAlias) +#else WEBGPU_ELEMENTWISE_IMPL(Atanh, "atanh(a)") +#endif WEBGPU_ELEMENTWISE_KERNEL(Atanh, 9, WebGpuSupportedFloatTypes()) WEBGPU_ELEMENTWISE_IMPL(Not, "!a")
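The Atanh workaround above reproduces the IEEE edge cases that Metal's built-in atanh drops. A quick host-side check of the intended values, in standard C++ and independent of the shader code:

    #include <cassert>
    #include <cmath>

    int main() {
      assert(std::isinf(std::atanh(1.0f)));  // atanh(1)  -> +inf, the "1 divided by 0" branch
      assert(std::isnan(std::atanh(1.5f)));  // atanh(>1) -> NaN,  the "0 divided by 0" branch
      return 0;
    }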
diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h index 2ddc9a6e8160f..72f38aecb99ce 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.h +++ b/onnxruntime/core/providers/webgpu/shader_variable.h @@ -67,6 +67,9 @@ class ShaderIndicesHelper { public: ShaderIndicesHelper(std::string_view name, ProgramVariableDataType type, ShaderUsage usage, const TensorShape& dims); + ShaderIndicesHelper(ShaderIndicesHelper&&) = default; + ShaderIndicesHelper& operator=(ShaderIndicesHelper&&) = default; + // get the number of components of the variable. inline int NumComponents() const { return num_components_; } diff --git a/onnxruntime/core/providers/webgpu/tensor/where.cc b/onnxruntime/core/providers/webgpu/tensor/where.cc index 31806a0af1741..1d58538a7489c 100644 --- a/onnxruntime/core/providers/webgpu/tensor/where.cc +++ b/onnxruntime/core/providers/webgpu/tensor/where.cc @@ -59,7 +59,7 @@ Status WhereProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& b_input = shader.AddInput("b_data", ShaderUsage::UseUniform); const auto& output = shader.AddOutput("output_data", ShaderUsage::UseUniform); - auto expression = [](const std::string& a, const std::string& b, const std::string& c) -> const auto { + const auto expression = [](const std::string& a, const std::string& b, const std::string& c) -> auto { return "select(" + b + ", " + a + ", " + c + ")"; }; std::string assignment; @@ -74,10 +74,10 @@ Status WhereProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& b_indices = shader.AddIndices("b_indices"); const auto& output_indices = shader.AddIndices("output_indices"); - auto single_assignment = + const auto single_assignment = [&expression, &output_indices, &a_indices, &b_indices, &c_indices]( const std::string& rest_str, const std::string& x, const std::string& type_cast = "") - -> const auto { + -> auto { const std::string a_expression = "a_data[index_a" + x + "][component_a" + x + "]"; const std::string b_expression = "b_data[index_b" + x + "][component_b" + x + "]"; const std::string c_expression = "bool(c_data[index_c" + x + "] & (0xffu << (component_c" + x + " * 8)))"; diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index e9ae97369c6c3..bb4ae4f6dcce5 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -4,6 +4,9 @@ #include #include +#include "dawn/dawn_proc.h" +#include "dawn/native/DawnNative.h" + #include "core/common/common.h" #include "core/providers/webgpu/compute_context.h" @@ -21,6 +24,8 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info std::call_once(init_flag_, [this, &webgpu_ep_info]() { // Initialization.Step.1 - Create wgpu::Instance if (instance_ == nullptr) { + dawnProcSetProcs(&dawn::native::GetProcs()); + wgpu::InstanceDescriptor instance_desc{}; instance_desc.features.timedWaitAnyEnable = true; instance_ = wgpu::CreateInstance(&instance_desc);
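With the monolithic dawn::webgpu_dawn library dropped in favor of dawn::dawn_native and dawn::dawn_proc, the proc table has to be installed before the first webgpu_cpp call. A minimal sketch of that initialization order, mirroring the hunk above (error handling omitted):

    #include "dawn/dawn_proc.h"
    #include "dawn/native/DawnNative.h"
    #include <webgpu/webgpu_cpp.h>

    int main() {
      // Route the global wgpu* entry points to dawn_native before any use.
      dawnProcSetProcs(&dawn::native::GetProcs());

      wgpu::InstanceDescriptor instance_desc{};
      instance_desc.features.timedWaitAnyEnable = true;  // as in the hunk above
      wgpu::Instance instance = wgpu::CreateInstance(&instance_desc);
      return instance ? 0 : 1;
    }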
diff --git a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm index d145a00b1348f..32b4b32e299d6 100644 --- a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm +++ b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm @@ -13,15 +13,19 @@ #if __has_include(<onnxruntime/coreml_provider_factory.h>) #define COREML_EP_AVAILABLE 1 +#include <onnxruntime/coreml_provider_factory.h> #else #define COREML_EP_AVAILABLE 0 #endif -#if COREML_EP_AVAILABLE -#include <onnxruntime/coreml_provider_factory.h> +#if __has_include(<onnxruntime/webgpu_provider_factory.h>) +#define WEBGPU_EP_AVAILABLE 1 +// WebGPU EP doesn't require including the header as it's enabled via AppendExecutionProvider +#else +#define WEBGPU_EP_AVAILABLE 0 #endif -void testSigmoid(const char* modelPath, bool useCoreML) { +void testSigmoid(const char* modelPath, bool useCoreML = false, bool useWebGPU = false) { // This is an e2e test for ORT C++ API Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "testCppAPI"); @@ -38,6 +42,12 @@ void testSigmoid(const char* modelPath, bool useCoreML) { (void)useCoreML; #endif + if (useWebGPU) { + std::unordered_map<std::string, std::string> provider_options; + // set provider options if needed. e.g. deviceId + session_options.AppendExecutionProvider("WebGPU", provider_options); + } + Ort::Session session(env, modelPath, session_options); size_t input_tensor_size = 3 * 4 * 5; @@ -96,7 +106,7 @@ - (NSString*)getFilePath { } - (void)testCppAPI_Basic { - testSigmoid([self getFilePath].UTF8String, false /* useCoreML */); + testSigmoid([self getFilePath].UTF8String); } #if COREML_EP_AVAILABLE @@ -105,4 +115,9 @@ - (void)testCppAPI_Basic_CoreML { } #endif +#if WEBGPU_EP_AVAILABLE +- (void)testCppAPI_Basic_WebGPU { + testSigmoid([self getFilePath].UTF8String, false /* useCoreML */, true /* useWebGPU */); +} +#endif @end diff --git a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm index 613c6e545939f..86001b6cb50a5 100644 --- a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm +++ b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm @@ -13,15 +13,19 @@ #if __has_include(<onnxruntime/coreml_provider_factory.h>) #define COREML_EP_AVAILABLE 1 +#include <onnxruntime/coreml_provider_factory.h> #else #define COREML_EP_AVAILABLE 0 #endif -#if COREML_EP_AVAILABLE -#include <onnxruntime/coreml_provider_factory.h> +#if __has_include(<onnxruntime/webgpu_provider_factory.h>) +#define WEBGPU_EP_AVAILABLE 1 +// WebGPU EP doesn't require including the header as it's enabled via AppendExecutionProvider +#else +#define WEBGPU_EP_AVAILABLE 0 #endif -void testSigmoid(const char* modelPath, bool useCoreML) { +void testSigmoid(const char* modelPath, bool useCoreML = false, bool useWebGPU = false) { // This is an e2e test for ORT C++ API Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "testCppAPI"); @@ -38,6 +42,12 @@ void testSigmoid(const char* modelPath, bool useCoreML) { (void)useCoreML; #endif + if (useWebGPU) { + std::unordered_map<std::string, std::string> provider_options; + // set provider options if needed. e.g.
deviceId + session_options.AppendExecutionProvider("WebGPU", provider_options); + } + Ort::Session session(env, modelPath, session_options); size_t input_tensor_size = 3 * 4 * 5; @@ -96,7 +106,7 @@ - (NSString*)getFilePath { } - (void)testCppAPI_Basic { - testSigmoid([self getFilePath].UTF8String, false /* useCoreML */); + testSigmoid([self getFilePath].UTF8String); } #if COREML_EP_AVAILABLE @@ -105,4 +115,10 @@ - (void)testCppAPI_Basic_CoreML { } #endif +#if WEBGPU_EP_AVAILABLE +- (void)testCppAPI_Basic_WebGPU { + testSigmoid([self getFilePath].UTF8String, false /* useCoreML */, true /* useWebGPU */); +} +#endif + @end diff --git a/tools/ci_build/github/android/default_full_aar_build_settings.json b/tools/ci_build/github/android/default_full_aar_build_settings.json index b0eff75812673..f08f246748a5a 100644 --- a/tools/ci_build/github/android/default_full_aar_build_settings.json +++ b/tools/ci_build/github/android/default_full_aar_build_settings.json @@ -16,6 +16,7 @@ "--build_shared_lib", "--use_nnapi", "--use_xnnpack", + "--use_webgpu", "--skip_tests" ] } diff --git a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py index 71aeb9e7b0304..dd037c17ae3b3 100755 --- a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py +++ b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py @@ -133,6 +133,8 @@ def main(): str(build_dir / "framework_out"), "--variant", package_variant.name, + "--test_project_stage_dir", # use a specific directory so it's easier to debug + str(build_dir / "test_apple_packages_staging"), ] run(test_apple_packages_args) diff --git a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json index 84d7e355ed5b4..6175ac3a0ad58 100644 --- a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json @@ -19,6 +19,7 @@ "--build_apple_framework", "--use_coreml", "--use_xnnpack", + "--use_webgpu", "--skip_tests", "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF" ], diff --git a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json index e2d8f70c02cf3..4c2c9442ab217 100644 --- a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json @@ -24,12 +24,14 @@ "--ios", "--use_xcode", "--use_xnnpack", + "--use_webgpu", "--apple_deploy_target=13.0" ], "iphonesimulator": [ "--ios", "--use_xcode", "--use_xnnpack", + "--use_webgpu", "--apple_deploy_target=13.0" ], "macabi":[ diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml index 3b661d9eb2dc6..c2b140652a2dd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml @@ -96,7 +96,7 @@ jobs: - template: mac-cpu-packaging-steps.yml parameters: MacosArch: ${{ parameters.MacosArch }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --use_coreml --cmake_extra_defines CMAKE_OSX_ARCHITECTURES="arm64;x86_64" + AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --use_coreml --use_webgpu --cmake_extra_defines 
CMAKE_OSX_ARCHITECTURES="arm64;x86_64" BuildJava: false BuildNodejs: false WithCache: ${{ parameters.WithCache }} @@ -108,7 +108,7 @@ jobs: - template: mac-cpu-packaging-steps.yml parameters: MacosArch: ${{ parameters.MacosArch }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64 + AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64 BuildJava: true BuildNodejs: true WithCache: ${{ parameters.WithCache }} @@ -120,7 +120,7 @@ jobs: - template: mac-cpu-packaging-steps.yml parameters: MacosArch: ${{ parameters.MacosArch }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml + AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --use_webgpu BuildJava: true BuildNodejs: true WithCache: ${{ parameters.WithCache }} From e5233ce865bd70d64830698b93e587888ec459c1 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 25 Sep 2024 17:01:49 -0700 Subject: [PATCH 4/6] enable build pipeline on Windows for WebGPU --- .../win-gpu-webgpu-ci-pipeline.yml | 58 +++++++++++++++++++ tools/ci_build/set-trigger-rules.py | 1 + 2 files changed, 59 insertions(+) create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml new file mode 100644 index 0000000000000..c4db7735aaf2f --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml @@ -0,0 +1,58 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +### please do rerun set-trigger-rules.py ### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### + +parameters: +- name: RunOnnxRuntimeTests + displayName: Run Tests? 
+ type: boolean + default: true + stages: +- stage: webgpu + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + EnvSetupScript: setup_env_cuda.bat + buildArch: x64 + # add --enable_pybind and --build_java if necessary + additionalBuildFlags: >- + --build_nodejs + --use_webgpu + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_RelWithDebInfo + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + ORT_EP_NAME: WebGPU + EnablePython: false + WITH_CACHE: true + MachinePool: onnxruntime-Win2022-VS2022-webgpu-A10 diff --git a/tools/ci_build/set-trigger-rules.py b/tools/ci_build/set-trigger-rules.py index fb6aa44cdf31a..0e9cd514d8aa5 100644 --- a/tools/ci_build/set-trigger-rules.py +++ b/tools/ci_build/set-trigger-rules.py @@ -40,6 +40,7 @@ "win-gpu-training-ci-pipeline.yml", "win-gpu-doc-gen-ci-pipeline.yml", "win-gpu-tensorrt-ci-pipeline.yml", + "win-gpu-webgpu-ci-pipeline.yml", "win-qnn-arm64-ci-pipeline.yml", "win-qnn-ci-pipeline.yml", ] From 0f7a5f6077f0885aa32b0ede324023419badb3c2 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Fri, 27 Sep 2024 13:49:09 +0800 Subject: [PATCH 5/6] [webgpu native] Add RotaryEmbedding op (#22194) ### Description ### Motivation and Context --- .../webgpu/bert/rotary_embedding.cc | 134 ++++++++++++++++++ .../webgpu/bert/rotary_embedding.h | 47 ++++++ .../webgpu/webgpu_contrib_kernels.cc | 2 +- .../contrib_ops/rotary_embedding_op_test.cc | 4 + 4 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc create mode 100644 onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc new file mode 100644 index 0000000000000..eb5cfad87597f --- /dev/null +++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc @@ -0,0 +1,134 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" +#include "contrib_ops/webgpu/webgpu_contrib_kernels.h" +#include "contrib_ops/webgpu/bert/rotary_embedding.h" + +namespace onnxruntime { +namespace contrib { +namespace webgpu { + +ONNX_OPERATOR_KERNEL_EX( + RotaryEmbedding, + kMSDomain, + 1, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedFloatTypes()) + .TypeConstraint("M", DataTypeImpl::GetTensorType<int64_t>()), + RotaryEmbedding); + +Status RotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) const { + const auto& input = shader.AddInput("input", ShaderUsage::UseUniform); + const auto& position_ids = shader.AddInput("position_ids", ShaderUsage::UseUniform); + const auto& cos_cache = shader.AddInput("cos_cache", ShaderUsage::UseUniform); + const auto& sin_cache = shader.AddInput("sin_cache", ShaderUsage::UseUniform); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform); + // TODO: remove output_indices. + const auto& output_indices = shader.AddIndices("output_indices", false); + const auto interleaved_str = interleaved_ ?
"true" : "false"; + shader.SetMainFunctionBody( + " let half_rotary_emb_dim = uniforms.cos_cache_shape[1];\n" + " let bsnh = global_idx / uniforms.global_stride % uniforms.global_shape;\n" + " let size = uniforms.global_shape[0] * uniforms.global_stride[0];\n", + " if (global_idx >= size) { return; }\n" + " if (bsnh[3] < half_rotary_emb_dim) {\n" + " let position_ids_idx = " + + position_ids.BroadcastedIndicesToOffset("bsnh.xy", output_indices) + ";\n" + + " let position_id = u32(" + + position_ids.GetByOffset("position_ids_idx") + ")" + + " + select(0, bsnh[1], position_ids_idx == 0);\n" + " let i = dot(bsnh, uniforms.input_output_stride) + select(0, bsnh[3], " + + interleaved_str + + ");\n" + " let j = i + select(half_rotary_emb_dim, 1, " + + interleaved_str + + ");\n" + " let re = " + + input.GetByOffset("i") + " * " + cos_cache.GetByIndices("vec2(position_id, bsnh[3])") + "-" + + input.GetByOffset("j") + " * " + sin_cache.GetByIndices("vec2(position_id, bsnh[3])") + ";\n" + + " " + output.SetByOffset("i", "re") + "\n" + + " let im = " + input.GetByOffset("i") + " * " + + sin_cache.GetByIndices("vec2(position_id, bsnh[3])") + + "+ " + input.GetByOffset("j") + + " * " + cos_cache.GetByIndices("vec2(position_id, bsnh[3])") + + ";\n " + output.SetByOffset("j", "im") + + "\n" + " } else { \n" + " let k = dot(bsnh, uniforms.input_output_stride) + half_rotary_emb_dim;\n" + + " " + output.SetByOffset("k", input.GetByOffset("k")) + + "\n" + " }"); + + return Status::OK(); +} + +RotaryEmbedding::RotaryEmbedding(const OpKernelInfo& info) : WebGpuKernel(info) { + scale_ = info.GetAttrOrDefault("scale", 1.0); + rotary_embedding_dim_ = static_cast(info.GetAttrOrDefault("rotary_embedding_dim", 0)); + num_heads_ = static_cast(info.GetAttrOrDefault("num_heads", 0)); + interleaved_ = (info.GetAttrOrDefault("interleaved", 0) == 1); + is_packed_batching_ = (info.GetAttrOrDefault("is_packed_batching", 0) == 1); +} + +Status RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { + const auto* input = context.Input(0); + const auto input_shape = input->Shape(); + const auto* position_ids = context.Input(1); + const auto* cos_cache = context.Input(2); + const auto* sin_cache = context.Input(3); + auto* output = context.Output(0, input_shape); + + const auto batch_size = gsl::narrow_cast(input->Shape()[0]); + const auto batch_stride = gsl::narrow_cast(input_shape.SizeFromDimension(1)); + const auto sequence_length = gsl::narrow_cast(input_shape[input_shape.NumDimensions() - 2]); + const auto hidden_size = batch_stride / sequence_length; + const auto half_rotary_embedding_dim = gsl::narrow_cast(cos_cache->Shape()[1]); + const auto head_size = rotary_embedding_dim_ == 0 ? half_rotary_embedding_dim * 2 : hidden_size / num_heads_; + + // Rotary embeddings will be calculated in a pair-wise fashion. In accordance, use the shape + // [batch size, sequence length, num of heads, num of pairs to rotate + num of dims to copy] + // to unfold the global index in shader. 
+ const TensorShape global_shape({batch_size, + sequence_length, + hidden_size / head_size, + head_size - half_rotary_embedding_dim}); + + const auto rank = global_shape.NumDimensions(); + std::vector<uint32_t> global_dims(rank); + std::vector<uint32_t> global_strides(rank); + for (size_t j = 0; j < rank; ++j) { + global_dims[j] = gsl::narrow_cast<uint32_t>(global_shape[j]); + global_strides[j] = gsl::narrow_cast<uint32_t>(global_shape.SizeFromDimension(j + 1)); + } + + const auto output_size = gsl::narrow_cast<uint32_t>(global_shape.Size()); + RotaryEmbeddingProgram program{interleaved_}; + const auto input_output_strides = + input_shape.NumDimensions() == 3 + ? std::vector<uint32_t>({batch_stride, hidden_size, head_size, 1}) + : (input_shape.NumDimensions() == 4 + ? std::vector<uint32_t>({batch_stride, head_size, sequence_length * head_size, 1}) + : std::vector<uint32_t>({})); + + program + .CacheHint(interleaved_) + .AddInputs({{input, ProgramTensorMetadataDependency::Rank}, + {position_ids, ProgramTensorMetadataDependency::Rank}, + {cos_cache, ProgramTensorMetadataDependency::Rank}, + {sin_cache, ProgramTensorMetadataDependency::Rank}}) + .AddOutput({output, ProgramTensorMetadataDependency::None}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .AddUniformVariables({{scale_}, + {gsl::make_span(global_dims)}, + {gsl::make_span(global_strides)}, + {gsl::make_span(input_output_strides)}}) + .AddIndices(TensorShape{1, 1}); + return context.RunProgram(program); +} + +} // namespace webgpu +} // namespace contrib +} // namespace onnxruntime
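For reference, the pair-wise update the shader computes is the standard rotation; writing theta for the cached angle indexed by (position_id, d) and (i, j) for an element pair (adjacent when interleaved, half_rotary_emb_dim apart otherwise), the re/im assignments above correspond to:

    \begin{aligned}
    \mathrm{out}_i &= x_i \cos\theta - x_j \sin\theta \\
    \mathrm{out}_j &= x_i \sin\theta + x_j \cos\theta
    \end{aligned}

Elements past the rotary dimension take the else branch and are copied through unchanged.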
diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h new file mode 100644 index 0000000000000..0d73b89fb62df --- /dev/null +++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/program.h" +#include "core/providers/webgpu/webgpu_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace webgpu { + +using namespace onnxruntime::webgpu; +using onnxruntime::webgpu::ComputeContext; + +class RotaryEmbeddingProgram final : public Program<RotaryEmbeddingProgram> { + public: + RotaryEmbeddingProgram(bool interleaved) : Program{"RotaryEmbedding"}, interleaved_{interleaved} { + } + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"scale", ProgramUniformVariableDataType::Float32}, + {"global_shape", ProgramUniformVariableDataType::Uint32}, + {"global_stride", ProgramUniformVariableDataType::Uint32}, + {"input_output_stride", ProgramUniformVariableDataType::Uint32}); + + private: + const bool interleaved_; +}; + +class RotaryEmbedding final : public WebGpuKernel { + public: + RotaryEmbedding(const OpKernelInfo& info); + Status ComputeInternal(ComputeContext& context) const override; + + private: + float scale_; + int num_heads_; + int rotary_embedding_dim_; + bool interleaved_; + bool is_packed_batching_; +}; + +} // namespace webgpu +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc b/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc index def104b6cb108..01c8a28d45069 100644 --- a/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc @@ -47,7 +47,7 @@ Status RegisterWebGpuContribKernels(KernelRegistry& kernel_registry) { // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, RotaryEmbedding)>, + BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, RotaryEmbedding)>, // BuildKernelCreateInfo, // BuildKernelCreateInfo Date: Fri, 27 Sep 2024 14:57:21 +0800 Subject: [PATCH 6/6] [webgpu native] Add transpose shared (#22098) ### Description ### Motivation and Context --- .../core/providers/webgpu/tensor/transpose.cc | 91 +++++++++++++++---- .../core/providers/webgpu/tensor/transpose.h | 24 ++--- 2 files changed, 87 insertions(+), 28 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index 0962d9191d785..e0a0113e13224 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -47,11 +47,11 @@ ONNX_OPERATOR_KERNEL_EX( .TypeConstraint("T", WebGpuSupportedNumberTypes()), Transpose); -const std::string AppendPermFunction(gsl::span<const int64_t> perm) { +const std::string AppendPermFunction(gsl::span<const size_t> perm) { std::ostringstream ss; ss.imbue(std::locale::classic()); - ss << "fn perm(i: y_indices_t)->x_indices_t {\n" - " var a: x_indices_t;\n"; + ss << "fn perm(i: output_indices_t)->a_indices_t {\n" + " var a: a_indices_t;\n"; for (size_t i = 0; i < perm.size(); ++i) { ss << " a[" << perm[i] << "] = i[" << i << "];\n"; } @@ -60,21 +60,52 @@ const std::string AppendPermFunction(gsl::span<const int64_t> perm) { return ss.str(); } +auto SqueezeShape(const gsl::span<const int64_t>& shape, const gsl::span<const size_t>& adjusted_perm, InlinedVector<int64_t>& new_shape, InlinedVector<size_t>& new_perm) { + for (auto i = 0; i < shape.size(); ++i) { + if (shape[i] != 1) { + new_shape.push_back(shape[i]); + } + if (shape[adjusted_perm[i]] != 1) { + new_perm.push_back(adjusted_perm[i]); + } + } +}; + Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const { - const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); - const auto& output =
shader.AddOutput("y", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); - shader.AppendImplementation(AppendPermFunction(this->perm_)); - shader.SetMainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"), - " let indices = ", output.OffsetToIndices("global_idx"), - ";\n" - " let x_indices = perm(indices); \n" - " ", - output.SetByOffset("global_idx", input.GetByIndices("x_indices"))); + const auto& input = shader.AddInput("a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + + if (use_shared_) { + shader.AppendImplementation("var tile : array, tile_size>;\n"); + shader.SetMainFunctionBody( + " let stride = (uniforms.output_shape[1] - 1) / tile_size + 1;\n" + " let workgroup_id_x = workgroup_idx % stride;\n" + " let workgroup_id_y = workgroup_idx / stride;\n" + " let input_col = workgroup_id_y * tile_size + local_id.x;\n" + " let input_row = workgroup_id_x * tile_size + local_id.y;\n" + " if (input_row < uniforms.a_shape[0] && input_col < uniforms.a_shape[1]) {\n" + " tile[local_id.y][local_id.x] = " + + input.GetByIndices("a_indices_t(input_row, input_col)") + + ";\n" + " }\n" + " workgroupBarrier();\n" + " let output_col = workgroup_id_x * tile_size + local_id.x;\n" + " let output_row = workgroup_id_y * tile_size + local_id.y;\n" + " if (output_row < uniforms.output_shape[0] && output_col < uniforms.output_shape[1]) {\n " + + output.SetByIndices("output_indices_t(output_row, output_col)", "tile[local_id.x][local_id.y]") + "\n }"); + } else { + shader.AppendImplementation(AppendPermFunction(this->perm_)); + shader.SetMainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"), + " let indices = ", output.OffsetToIndices("global_idx"), + ";\n" + " let x_indices = perm(indices);\n", + " ", + output.SetByOffset("global_idx", input.GetByIndices("x_indices"))); + } return Status::OK(); } Status Transpose::ComputeInternal(ComputeContext& context) const { - // TODO: there is an optimized version of transpose to port. const auto* input_tensor = context.Input(0); const TensorShape& input_shape = input_tensor->Shape(); int32_t rank = gsl::narrow_cast(input_shape.NumDimensions()); @@ -86,16 +117,42 @@ Status Transpose::ComputeInternal(ComputeContext& context) const { TensorShape output_shape(output_dims); auto* output_tensor = context.Output(0, output_shape); + InlinedVector new_shape{}; + InlinedVector new_perm{}; + SqueezeShape(input_shape.GetDims(), *p_perm, new_shape, new_perm); + const bool channels_last = new_perm == InlinedVector({2, 3, 1}); + const bool channels_first = new_perm == InlinedVector({3, 1, 2}); + const bool use_shared = (new_shape.size() == 2 && new_perm[0] > new_perm[1]) || channels_last || channels_first; + auto new_input_shape = input_shape; + TensorShape new_output_shape(output_dims); + if (use_shared) { + new_input_shape = channels_last + ? TensorShape({new_shape[0], new_shape[1] * new_shape[2]}) + : channels_first + ? 
diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.h b/onnxruntime/core/providers/webgpu/tensor/transpose.h index 3ca5674d5dfab..7cf5c1fe0865d 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.h +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.h @@ -11,26 +11,28 @@ namespace onnxruntime { namespace webgpu { +class Transpose final : public WebGpuKernel, public TransposeBase { + public: + Transpose(const OpKernelInfo& info) : WebGpuKernel{info}, TransposeBase{info} { + } + Status ComputeInternal(ComputeContext& context) const override; + constexpr static uint32_t TILE_SIZE = 16; +}; + class TransposeProgram final : public Program<TransposeProgram> { public: - TransposeProgram(const gsl::span<const int64_t>& permutations) - : Program{"Transpose"}, perm_(permutations.begin(), permutations.end()) { + TransposeProgram(const gsl::span<const size_t>& permutations, bool use_shared) + : Program{"Transpose"}, perm_(permutations.begin(), permutations.end()), use_shared_(use_shared) { } Status GenerateShaderCode(ShaderHelper& sh) const override; WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}); + WEBGPU_PROGRAM_DEFINE_CONSTANTS({"tile_size", Transpose::TILE_SIZE}); private: - InlinedVector<int64_t> perm_; -}; - -class Transpose final : public WebGpuKernel, public TransposeBase { - public: - Transpose(const OpKernelInfo& info) : WebGpuKernel{info}, TransposeBase{info} { - } - - Status ComputeInternal(ComputeContext& context) const override; + InlinedVector<size_t> perm_; + const bool use_shared_; }; } // namespace webgpu