From 0101ce8fa32856cfc62a2402f7a8f5c5938782d7 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:38:02 -0700 Subject: [PATCH 1/6] remove toggle "disable_workgroup_init" --- onnxruntime/core/providers/webgpu/webgpu_context.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index f2414c14f6f9a..7dbccb532dd55 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -387,7 +387,6 @@ std::vector<const char*> WebGpuContext::GetEnabledDeviceToggles() const { constexpr const char* toggles[] = { "skip_validation", // only use "skip_validation" when ValidationMode is set to "Disabled" "disable_robustness", - "disable_workgroup_init", "d3d_disable_ieee_strictness", }; return std::vector<const char*>(ValidationMode() >= ValidationMode::WGPUOnly From 38967062675ef6906824f5317732b5630054220c Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:44:32 -0700 Subject: [PATCH 2/6] set backend type to D3D12 since we always use DXC (win). --- onnxruntime/core/providers/webgpu/webgpu_context.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index 7dbccb532dd55..e9ae97369c6c3 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -33,6 +33,9 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info wgpu::RequestAdapterOptions req_adapter_options = {}; wgpu::DawnTogglesDescriptor adapter_toggles_desc = {}; req_adapter_options.nextInChain = &adapter_toggles_desc; +#ifdef WIN32 + req_adapter_options.backendType = wgpu::BackendType::D3D12; +#endif auto enabled_adapter_toggles = GetEnabledAdapterToggles(); adapter_toggles_desc.enabledToggleCount = enabled_adapter_toggles.size();
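For reference, the adapter-request pattern patch 2 relies on looks roughly like this in isolation — a minimal sketch, not the EP's actual code (the function name and toggle list are illustrative; `_WIN32` is used here as the standard compiler-defined macro):

    #include <webgpu/webgpu_cpp.h>

    // Chain a DawnTogglesDescriptor into the adapter options and, on Windows,
    // pin the backend to D3D12 (shaders there are always compiled with DXC).
    void ConfigureAdapterOptions(wgpu::RequestAdapterOptions& options,
                                 wgpu::DawnTogglesDescriptor& toggles,
                                 const char* const* enabled_toggles,
                                 size_t enabled_count) {
      toggles.enabledToggles = enabled_toggles;
      toggles.enabledToggleCount = enabled_count;
      options.nextInChain = &toggles;  // 'toggles' must outlive the adapter request
    #ifdef _WIN32
      options.backendType = wgpu::BackendType::D3D12;
    #endif
    }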
From f02e85a3adefb62649e97c6fadd5851e6fa1ab2c Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 25 Sep 2024 16:53:41 -0700 Subject: [PATCH 3/6] update build configurations to webgpu EP (#22047) ### Description --------- Co-authored-by: Scott McKay --- .../external/onnxruntime_external_deps.cmake | 82 ++++++++++++++----- cmake/onnxruntime.cmake | 66 +++++++++++++-- cmake/onnxruntime_providers_webgpu.cmake | 12 +-- cmake/patches/dawn/dawn.patch | 66 +++++++++++++++ .../webgpu/webgpu_provider_factory.h | 14 ++++ .../main/java/ai/onnxruntime/OrtProvider.java | 4 +- .../platform/apple/logging/apple_log_sink.mm | 2 - .../webgpu/math/unary_elementwise_ops.cc | 12 +++ .../core/providers/webgpu/shader_variable.h | 3 + .../core/providers/webgpu/tensor/where.cc | 6 +- .../core/providers/webgpu/webgpu_context.cc | 5 ++ .../ios_package_uitest_cpp_api.mm | 23 +++++- .../macos_package_uitest_cpp_api.mm | 24 +++++- .../default_full_aar_build_settings.json | 1 + .../apple/build_and_assemble_apple_pods.py | 2 + ...t_full_apple_framework_build_settings.json | 1 + ...ult_full_ios_framework_build_settings.json | 2 + .../templates/mac-cpu-packing-jobs.yml | 6 +- 18 files changed, 277 insertions(+), 54 deletions(-) create mode 100644 cmake/patches/dawn/dawn.patch create mode 100644 include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 370a2d5c72351..6f54ce1b4face 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -575,10 +575,11 @@ if (onnxruntime_USE_MIMALLOC) onnxruntime_fetchcontent_makeavailable(mimalloc) endif() -#onnxruntime_EXTERNAL_LIBRARIES could contain onnx, onnx_proto,libprotobuf, cuda/cudnn, -# dnnl/mklml, onnxruntime_codegen_tvm, tvm and pthread -# pthread is always at the last -set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${WIL_TARGET} nlohmann_json::nlohmann_json onnx onnx_proto ${PROTOBUF_LIB} re2::re2 Boost::mp11 safeint_interface flatbuffers::flatbuffers ${GSL_TARGET} ${ABSEIL_LIBS} date::date ${ONNXRUNTIME_CLOG_TARGET_NAME}) +set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${WIL_TARGET} nlohmann_json::nlohmann_json + onnx onnx_proto ${PROTOBUF_LIB} re2::re2 Boost::mp11 safeint_interface + flatbuffers::flatbuffers ${GSL_TARGET} ${ABSEIL_LIBS} date::date + ${ONNXRUNTIME_CLOG_TARGET_NAME}) + # The source code of onnx_proto is generated, we must build this lib first before starting to compile the other source code that uses ONNX protobuf types. # The other libs do not have the problem. All the sources are already there. We can compile them in any order. set(onnxruntime_EXTERNAL_DEPENDENCIES onnx_proto flatbuffers::flatbuffers) @@ -638,33 +639,70 @@ if (onnxruntime_USE_WEBGPU) dawn URL ${DEP_URL_dawn} URL_HASH SHA1=${DEP_SHA1_dawn} + PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch ) - set(DAWN_FETCH_DEPENDENCIES ON) - set(DAWN_ENABLE_INSTALL ON) - set(TINT_BUILD_TESTS OFF) - set(DAWN_USE_BUILT_DXC ON) + + # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size + set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE) + set(DAWN_BUILD_SAMPLES OFF CACHE BOOL "" FORCE) + set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE) + set(DAWN_ENABLE_NULL OFF CACHE BOOL "" FORCE) + set(DAWN_FETCH_DEPENDENCIES ON CACHE BOOL "" FORCE) + + # disable things we don't use set(DAWN_DXC_ENABLE_ASSERTS_IN_NDEBUG OFF) + set(DAWN_ENABLE_DESKTOP_GL OFF CACHE BOOL "" FORCE) + set(DAWN_ENABLE_OPENGLES OFF CACHE BOOL "" FORCE) + set(DAWN_SUPPORTS_GLFW_FOR_WINDOWING OFF CACHE BOOL "" FORCE) + set(DAWN_USE_GLFW OFF CACHE BOOL "" FORCE) + set(DAWN_USE_WINDOWS_UI OFF CACHE BOOL "" FORCE) + set(DAWN_USE_X11 OFF CACHE BOOL "" FORCE) + + set(TINT_BUILD_TESTS OFF CACHE BOOL "" FORCE) + set(TINT_BUILD_CMD_TOOLS OFF CACHE BOOL "" FORCE) + set(TINT_BUILD_GLSL_WRITER OFF CACHE BOOL "" FORCE) + set(TINT_BUILD_GLSL_VALIDATOR OFF CACHE BOOL "" FORCE) + set(TINT_BUILD_IR_BINARY OFF CACHE BOOL "" FORCE) + set(TINT_BUILD_SPV_READER OFF CACHE BOOL "" FORCE) # don't need. disabling is a large binary size saving + set(TINT_BUILD_WGSL_WRITER ON CACHE BOOL "" FORCE) # needed to create cache key. runtime error if not enabled. + + # SPIR-V validation shouldn't be required given we're using Tint to create the SPIR-V. + if (NOT CMAKE_BUILD_TYPE STREQUAL "Debug") + set(DAWN_ENABLE_SPIRV_VALIDATION OFF CACHE BOOL "" FORCE) + endif() -message(STATUS "Finished fetching external dependencies") + if (WIN32) + # building this requires the HLSL writer to be enabled in Tint. TBD if we need either of these to be ON.
+ set(DAWN_USE_BUILT_DXC ON CACHE BOOL "" FORCE) + set(TINT_BUILD_HLSL_WRITER ON CACHE BOOL "" FORCE) -set(onnxruntime_LINK_DIRS ) + # Vulkan may optionally be included in a Windows build. Exclude until we have an explicit use case that requires it. + set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE) + endif() + onnxruntime_fetchcontent_makeavailable(dawn) + + list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_native dawn::dawn_proc) +endif() + +set(onnxruntime_LINK_DIRS) if (onnxruntime_USE_CUDA) - find_package(CUDAToolkit REQUIRED) + find_package(CUDAToolkit REQUIRED) - if(onnxruntime_CUDNN_HOME) - file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME) - set(CUDNN_PATH ${onnxruntime_CUDNN_HOME}) - endif() - include(cuDNN) + if(onnxruntime_CUDNN_HOME) + file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME) + set(CUDNN_PATH ${onnxruntime_CUDNN_HOME}) + endif() + + include(cuDNN) endif() if(onnxruntime_USE_SNPE) - include(external/find_snpe.cmake) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SNPE_NN_LIBS}) + include(external/find_snpe.cmake) + list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SNPE_NN_LIBS}) endif() -FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR} ORT_BINARY_DIR) -FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} ORT_SOURCE_DIR) +FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR} ORT_BINARY_DIR) +FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} ORT_SOURCE_DIR) + +message(STATUS "Finished fetching external dependencies") diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 9b6acea876f95..b1d797ca16adc 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -89,10 +89,22 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK) # create Info.plist for the framework and podspec for CocoaPods (optional) set(MACOSX_FRAMEWORK_NAME "onnxruntime") set(MACOSX_FRAMEWORK_IDENTIFIER "com.microsoft.onnxruntime") - # Need to include CoreML as a weaklink for CocoaPods package if the EP is enabled + + # Setup weak frameworks for macOS/iOS. 'weak' as the CoreML or WebGPU EPs are optionally enabled. 
if(onnxruntime_USE_COREML) - set(APPLE_WEAK_FRAMEWORK "\\\"CoreML\\\"") + list(APPEND _weak_frameworks "\\\"CoreML\\\"") + endif() + + if(onnxruntime_USE_WEBGPU) + list(APPEND _weak_frameworks "\\\"QuartzCore\\\"") + list(APPEND _weak_frameworks "\\\"IOSurface\\\"") + list(APPEND _weak_frameworks "\\\"Metal\\\"") endif() + + if (_weak_frameworks) + string(JOIN ", " APPLE_WEAK_FRAMEWORK ${_weak_frameworks}) + endif() + set(INFO_PLIST_PATH "${CMAKE_CURRENT_BINARY_DIR}/Info.plist") configure_file(${REPO_ROOT}/cmake/Info.plist.in ${INFO_PLIST_PATH}) configure_file( @@ -364,16 +376,58 @@ if(onnxruntime_BUILD_APPLE_FRAMEWORK) endif() endforeach() + # helper function that recurses to also handle static library dependencies of the ORT external libraries + set(_processed_libs) # keep track of processed libraries to skip any duplicate dependencies + function(add_symlink_for_static_lib_and_dependencies lib) + function(process cur_target) + # de-alias if applicable so a consistent target name is used + get_target_property(alias ${cur_target} ALIASED_TARGET) + if(TARGET ${alias}) + set(cur_target ${alias}) + endif() + + if(${cur_target} IN_LIST _processed_libs OR ${cur_target} IN_LIST lib_and_dependencies) + return() + endif() + + list(APPEND lib_and_dependencies ${cur_target}) + + get_target_property(link_libraries ${cur_target} LINK_LIBRARIES) + foreach(dependency ${link_libraries}) + if(TARGET ${dependency}) + process(${dependency}) + endif() + endforeach() + + set(lib_and_dependencies ${lib_and_dependencies} PARENT_SCOPE) + endfunction() + + set(lib_and_dependencies) + process(${lib}) + + foreach(_target ${lib_and_dependencies}) + get_target_property(type ${_target} TYPE) + if(${type} STREQUAL "STATIC_LIBRARY") + # message(STATUS "Adding symlink for ${_target}") + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ${CMAKE_COMMAND} -E create_symlink + $<TARGET_FILE:${_target}> ${STATIC_LIB_DIR}/$<TARGET_FILE_NAME:${_target}>) + endif() + endforeach() + + list(APPEND _processed_libs ${lib_and_dependencies}) + set(_processed_libs ${_processed_libs} PARENT_SCOPE) + endfunction() + # for external libraries we create a symlink to the .a file foreach(_LIB ${onnxruntime_EXTERNAL_LIBRARIES}) - if(NOT TARGET ${_LIB}) # if we didn't build from source.
it may not a target + if(NOT TARGET ${_LIB}) # if we didn't build from source it may not be a target continue() endif() + GET_TARGET_PROPERTY(_LIB_TYPE ${_LIB} TYPE) if(_LIB_TYPE STREQUAL "STATIC_LIBRARY") - add_custom_command(TARGET onnxruntime POST_BUILD - COMMAND ${CMAKE_COMMAND} -E create_symlink - $<TARGET_FILE:${_LIB}> ${STATIC_LIB_DIR}/$<TARGET_FILE_NAME:${_LIB}>) + add_symlink_for_static_lib_and_dependencies(${_LIB}) endif() endforeach() diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake index 587c4b2c1ff2c..8d00ab5aa4494 100644 --- a/cmake/onnxruntime_providers_webgpu.cmake +++ b/cmake/onnxruntime_providers_webgpu.cmake @@ -24,14 +24,8 @@ source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_providers_webgpu_cc_srcs}) onnxruntime_add_static_library(onnxruntime_providers_webgpu ${onnxruntime_providers_webgpu_cc_srcs}) - onnxruntime_add_include_to_target(onnxruntime_providers_webgpu onnxruntime_common onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface) - target_link_libraries(onnxruntime_providers_webgpu dawn::webgpu_dawn) - - # Copy webgpu_dawn.dll to the output directory - add_custom_command( - TARGET onnxruntime_providers_webgpu - POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy_if_different "$<TARGET_FILE:dawn::webgpu_dawn>" "$<TARGET_FILE_DIR:onnxruntime_providers_webgpu>" - VERBATIM ) + onnxruntime_add_include_to_target(onnxruntime_providers_webgpu + onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface) + target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native dawn::dawn_proc) set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime") diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch new file mode 100644 index 0000000000000..d696d386452e8 --- /dev/null +++ b/cmake/patches/dawn/dawn.patch @@ -0,0 +1,66 @@ +diff --git a/src/dawn/native/CMakeLists.txt b/src/dawn/native/CMakeLists.txt +index 9c0bd6fa4e..bf8a57aeac 100644 +--- a/src/dawn/native/CMakeLists.txt ++++ b/src/dawn/native/CMakeLists.txt +@@ -857,6 +857,11 @@ if (DAWN_ENABLE_SWIFTSHADER) + target_compile_definitions(dawn_native PRIVATE "DAWN_ENABLE_SWIFTSHADER") + endif() ++ ++if (IOS) ++ target_compile_options(dawn_native_objects PRIVATE -fno-objc-arc) ++ target_compile_options(dawn_native PRIVATE -fno-objc-arc) ++endif() ++ + if (DAWN_BUILD_MONOLITHIC_LIBRARY) + ############################################################################### + # Do the 'complete_lib' build.
+diff --git a/src/dawn/native/Surface_metal.mm b/src/dawn/native/Surface_metal.mm +index ce55acbd43..baa4835362 100644 +--- a/src/dawn/native/Surface_metal.mm ++++ b/src/dawn/native/Surface_metal.mm +@@ -36,7 +36,13 @@ + namespace dawn::native { + + bool InheritsFromCAMetalLayer(void* obj) { +- id object = static_cast<id>(obj); ++ id object = ++#if TARGET_OS_IOS ++ (__bridge id)obj; ++#else ++ static_cast<id>(obj); ++#endif ++ + return [object isKindOfClass:[CAMetalLayer class]]; + } + +diff --git a/src/dawn/native/metal/SharedFenceMTL.mm b/src/dawn/native/metal/SharedFenceMTL.mm +index bde8bfea07..f2f6459e91 100644 +--- a/src/dawn/native/metal/SharedFenceMTL.mm ++++ b/src/dawn/native/metal/SharedFenceMTL.mm +@@ -40,7 +40,13 @@ ResultOrError<Ref<SharedFence>> SharedFence::Create( + DAWN_INVALID_IF(descriptor->sharedEvent == nullptr, "MTLSharedEvent is missing."); + if (@available(macOS 10.14, iOS 12.0, *)) { + return AcquireRef(new SharedFence( +- device, label, static_cast<id<MTLSharedEvent>>(descriptor->sharedEvent))); ++ device, label, ++#if TARGET_OS_IOS ++ (__bridge id<MTLSharedEvent>)(descriptor->sharedEvent) ++#else ++ static_cast<id<MTLSharedEvent>>(descriptor->sharedEvent) ++#endif ++ )); + } else { + return DAWN_INTERNAL_ERROR("MTLSharedEvent not supported."); + } + +diff --git a/src/tint/api/BUILD.cmake b/src/tint/api/BUILD.cmake +index 0037d83276..6372c4ee77 100644 +--- a/src/tint/api/BUILD.cmake ++++ b/src/tint/api/BUILD.cmake +@@ -57,6 +57,7 @@ tint_target_add_dependencies(tint_api lib + tint_lang_wgsl_ast_transform + tint_lang_wgsl_common + tint_lang_wgsl_features ++ tint_lang_wgsl_inspector + tint_lang_wgsl_program + tint_lang_wgsl_sem + tint_lang_wgsl_writer_ir_to_program diff --git a/include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h b/include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h new file mode 100644 index 0000000000000..0b45b847d651f --- /dev/null +++ b/include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h @@ -0,0 +1,14 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +// Dummy file to provide a signal in the ONNX Runtime C cocoapod as to whether the WebGPU EP was included in the build. +// If it was, this file will be included in the cocoapod, and a test like this can be used: +// +// #if __has_include(<onnxruntime/webgpu_provider_factory.h>) +// #define WEBGPU_EP_AVAILABLE 1 +// #else +// #define WEBGPU_EP_AVAILABLE 0 +// #endif + +// The WebGPU EP can be enabled via the generic SessionOptionsAppendExecutionProvider method, so no direct usage of +// the provider factory is required.
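Taken together with the session options API, the client-side pattern this dummy header enables looks roughly like the following (a sketch assuming the cocoapod header layout shown above; the option map is left empty):

    // Compile-time availability probe for the WebGPU EP in the C/C++ pod.
    #if __has_include(<onnxruntime/webgpu_provider_factory.h>)
    #define WEBGPU_EP_AVAILABLE 1
    #else
    #define WEBGPU_EP_AVAILABLE 0
    #endif

    // Registration then goes through the generic API; no factory header is needed:
    //   std::unordered_map<std::string, std::string> provider_options;
    //   session_options.AppendExecutionProvider("WebGPU", provider_options);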
diff --git a/java/src/main/java/ai/onnxruntime/OrtProvider.java b/java/src/main/java/ai/onnxruntime/OrtProvider.java index ae9cb9f908629..b06f884896ee8 100644 --- a/java/src/main/java/ai/onnxruntime/OrtProvider.java +++ b/java/src/main/java/ai/onnxruntime/OrtProvider.java @@ -40,7 +40,9 @@ public enum OrtProvider { /** The XNNPACK execution provider. */ XNNPACK("XnnpackExecutionProvider"), /** The Azure remote endpoint execution provider. */ - AZURE("AzureExecutionProvider"); + AZURE("AzureExecutionProvider"), + /** The WebGPU execution provider. */ + WEBGPU("WebGpuExecutionProvider"); private static final Map<String, OrtProvider> valueMap = new HashMap<>(values().length); diff --git a/onnxruntime/core/platform/apple/logging/apple_log_sink.mm b/onnxruntime/core/platform/apple/logging/apple_log_sink.mm index 00e691a8f9fd3..6abbe76a7f151 100644 --- a/onnxruntime/core/platform/apple/logging/apple_log_sink.mm +++ b/onnxruntime/core/platform/apple/logging/apple_log_sink.mm @@ -7,8 +7,6 @@ #include -#include "date/date.h" - namespace onnxruntime { namespace logging { diff --git a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc index 3b43c87fb0c82..9e8117aa34a92 100644 --- a/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc +++ b/onnxruntime/core/providers/webgpu/math/unary_elementwise_ops.cc @@ -165,7 +165,19 @@ WEBGPU_ELEMENTWISE_KERNEL(Asinh, 9, WebGpuSupportedFloatTypes()) WEBGPU_ELEMENTWISE_IMPL(Acosh, "acosh(a)") WEBGPU_ELEMENTWISE_KERNEL(Acosh, 9, WebGpuSupportedFloatTypes()) +#if __APPLE__ +// Metal returns 0 for values >= 1.0. +// Need custom impl to return +inf for 1.0 (by dividing 1 by 0), and NaN for > 1.0 (by dividing 0 by 0) +WEBGPU_ELEMENTWISE_IMPL(Atanh, + "select(" + " select(x_value_t(1.0), x_value_t(0.0), a > x_value_t(1.0)) / x_value_t(0.0)," + " atanh(a)," + " a < x_value_t(1.0))", + "", + ShaderUsage::UseValueTypeAlias) +#else WEBGPU_ELEMENTWISE_IMPL(Atanh, "atanh(a)") +#endif WEBGPU_ELEMENTWISE_KERNEL(Atanh, 9, WebGpuSupportedFloatTypes()) WEBGPU_ELEMENTWISE_IMPL(Not, "!a")
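The Atanh workaround above reproduces the IEEE edge cases that Metal's built-in atanh drops. A quick host-side check of the intended values, in standard C++ and independent of the shader code:

    #include <cassert>
    #include <cmath>

    int main() {
      assert(std::isinf(std::atanh(1.0f)));  // atanh(1)  -> +inf, the "1 divided by 0" branch
      assert(std::isnan(std::atanh(1.5f)));  // atanh(>1) -> NaN,  the "0 divided by 0" branch
      return 0;
    }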
diff --git a/onnxruntime/core/providers/webgpu/shader_variable.h b/onnxruntime/core/providers/webgpu/shader_variable.h index 2ddc9a6e8160f..72f38aecb99ce 100644 --- a/onnxruntime/core/providers/webgpu/shader_variable.h +++ b/onnxruntime/core/providers/webgpu/shader_variable.h @@ -67,6 +67,9 @@ class ShaderIndicesHelper { public: ShaderIndicesHelper(std::string_view name, ProgramVariableDataType type, ShaderUsage usage, const TensorShape& dims); + ShaderIndicesHelper(ShaderIndicesHelper&&) = default; + ShaderIndicesHelper& operator=(ShaderIndicesHelper&&) = default; + // get the number of components of the variable. inline int NumComponents() const { return num_components_; } diff --git a/onnxruntime/core/providers/webgpu/tensor/where.cc b/onnxruntime/core/providers/webgpu/tensor/where.cc index 31806a0af1741..1d58538a7489c 100644 --- a/onnxruntime/core/providers/webgpu/tensor/where.cc +++ b/onnxruntime/core/providers/webgpu/tensor/where.cc @@ -59,7 +59,7 @@ Status WhereProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& b_input = shader.AddInput("b_data", ShaderUsage::UseUniform); const auto& output = shader.AddOutput("output_data", ShaderUsage::UseUniform); - auto expression = [](const std::string& a, const std::string& b, const std::string& c) -> const auto { + const auto expression = [](const std::string& a, const std::string& b, const std::string& c) -> auto { return "select(" + b + ", " + a + ", " + c + ")"; }; std::string assignment; @@ -74,10 +74,10 @@ Status WhereProgram::GenerateShaderCode(ShaderHelper& shader) const { const auto& b_indices = shader.AddIndices("b_indices"); const auto& output_indices = shader.AddIndices("output_indices"); - auto single_assignment = + const auto single_assignment = [&expression, &output_indices, &a_indices, &b_indices, &c_indices]( const std::string& rest_str, const std::string& x, const std::string& type_cast = "") - -> const auto { + -> auto { const std::string a_expression = "a_data[index_a" + x + "][component_a" + x + "]"; const std::string b_expression = "b_data[index_b" + x + "][component_b" + x + "]"; const std::string c_expression = "bool(c_data[index_c" + x + "] & (0xffu << (component_c" + x + " * 8)))"; diff --git a/onnxruntime/core/providers/webgpu/webgpu_context.cc b/onnxruntime/core/providers/webgpu/webgpu_context.cc index e9ae97369c6c3..bb4ae4f6dcce5 100644 --- a/onnxruntime/core/providers/webgpu/webgpu_context.cc +++ b/onnxruntime/core/providers/webgpu/webgpu_context.cc @@ -4,6 +4,9 @@ #include #include +#include "dawn/dawn_proc.h" +#include "dawn/native/DawnNative.h" + #include "core/common/common.h" #include "core/providers/webgpu/compute_context.h" @@ -21,6 +24,8 @@ void WebGpuContext::Initialize(const WebGpuExecutionProviderInfo& webgpu_ep_info std::call_once(init_flag_, [this, &webgpu_ep_info]() { // Initialization.Step.1 - Create wgpu::Instance if (instance_ == nullptr) { + dawnProcSetProcs(&dawn::native::GetProcs()); + wgpu::InstanceDescriptor instance_desc{}; instance_desc.features.timedWaitAnyEnable = true; instance_ = wgpu::CreateInstance(&instance_desc);
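With the monolithic dawn::webgpu_dawn library dropped in favor of dawn::dawn_native and dawn::dawn_proc, the proc table has to be installed before the first webgpu_cpp call. A minimal sketch of that initialization order, mirroring the hunk above (error handling omitted):

    #include "dawn/dawn_proc.h"
    #include "dawn/native/DawnNative.h"
    #include <webgpu/webgpu_cpp.h>

    int main() {
      // Route the global wgpu* entry points to dawn_native before any use.
      dawnProcSetProcs(&dawn::native::GetProcs());

      wgpu::InstanceDescriptor instance_desc{};
      instance_desc.features.timedWaitAnyEnable = true;  // as in the hunk above
      wgpu::Instance instance = wgpu::CreateInstance(&instance_desc);
      return instance ? 0 : 1;
    }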
diff --git a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm index d145a00b1348f..32b4b32e299d6 100644 --- a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm +++ b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm @@ -13,15 +13,19 @@ #if __has_include(<onnxruntime/coreml_provider_factory.h>) #define COREML_EP_AVAILABLE 1 +#include <onnxruntime/coreml_provider_factory.h> #else #define COREML_EP_AVAILABLE 0 #endif -#if COREML_EP_AVAILABLE -#include <onnxruntime/coreml_provider_factory.h> +#if __has_include(<onnxruntime/webgpu_provider_factory.h>) +#define WEBGPU_EP_AVAILABLE 1 +// WebGPU EP doesn't require including the header as it's enabled via AppendExecutionProvider +#else +#define WEBGPU_EP_AVAILABLE 0 #endif -void testSigmoid(const char* modelPath, bool useCoreML) { +void testSigmoid(const char* modelPath, bool useCoreML = false, bool useWebGPU = false) { // This is an e2e test for ORT C++ API Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "testCppAPI"); @@ -38,6 +42,12 @@ void testSigmoid(const char* modelPath, bool useCoreML) { (void)useCoreML; #endif + if (useWebGPU) { + std::unordered_map<std::string, std::string> provider_options; + // set provider options if needed. e.g. deviceId + session_options.AppendExecutionProvider("WebGPU", provider_options); + } + Ort::Session session(env, modelPath, session_options); size_t input_tensor_size = 3 * 4 * 5; @@ -96,7 +106,7 @@ - (NSString*)getFilePath { } - (void)testCppAPI_Basic { - testSigmoid([self getFilePath].UTF8String, false /* useCoreML */); + testSigmoid([self getFilePath].UTF8String); } #if COREML_EP_AVAILABLE @@ -105,4 +115,9 @@ - (void)testCppAPI_Basic_CoreML { } #endif +#if WEBGPU_EP_AVAILABLE +- (void)testCppAPI_Basic_WebGPU { + testSigmoid([self getFilePath].UTF8String, false /* useCoreML */, true /* useWebGPU */); +} +#endif @end diff --git a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm index 613c6e545939f..86001b6cb50a5 100644 --- a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm +++ b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm @@ -13,15 +13,19 @@ #if __has_include(<onnxruntime/coreml_provider_factory.h>) #define COREML_EP_AVAILABLE 1 +#include <onnxruntime/coreml_provider_factory.h> #else #define COREML_EP_AVAILABLE 0 #endif -#if COREML_EP_AVAILABLE -#include <onnxruntime/coreml_provider_factory.h> +#if __has_include(<onnxruntime/webgpu_provider_factory.h>) +#define WEBGPU_EP_AVAILABLE 1 +// WebGPU EP doesn't require including the header as it's enabled via AppendExecutionProvider +#else +#define WEBGPU_EP_AVAILABLE 0 #endif -void testSigmoid(const char* modelPath, bool useCoreML) { +void testSigmoid(const char* modelPath, bool useCoreML = false, bool useWebGPU = false) { // This is an e2e test for ORT C++ API Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "testCppAPI"); @@ -38,6 +42,12 @@ void testSigmoid(const char* modelPath, bool useCoreML) { (void)useCoreML; #endif + if (useWebGPU) { + std::unordered_map<std::string, std::string> provider_options; + // set provider options if needed. e.g.
deviceId + session_options.AppendExecutionProvider("WebGPU", provider_options); + } + Ort::Session session(env, modelPath, session_options); size_t input_tensor_size = 3 * 4 * 5; @@ -96,7 +106,7 @@ - (NSString*)getFilePath { } - (void)testCppAPI_Basic { - testSigmoid([self getFilePath].UTF8String, false /* useCoreML */); + testSigmoid([self getFilePath].UTF8String); } #if COREML_EP_AVAILABLE @@ -105,4 +115,10 @@ - (void)testCppAPI_Basic_CoreML { } #endif +#if WEBGPU_EP_AVAILABLE +- (void)testCppAPI_Basic_WebGPU { + testSigmoid([self getFilePath].UTF8String, false /* useCoreML */, true /* useWebGPU */); +} +#endif + @end diff --git a/tools/ci_build/github/android/default_full_aar_build_settings.json b/tools/ci_build/github/android/default_full_aar_build_settings.json index b0eff75812673..f08f246748a5a 100644 --- a/tools/ci_build/github/android/default_full_aar_build_settings.json +++ b/tools/ci_build/github/android/default_full_aar_build_settings.json @@ -16,6 +16,7 @@ "--build_shared_lib", "--use_nnapi", "--use_xnnpack", + "--use_webgpu", "--skip_tests" ] } diff --git a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py index 71aeb9e7b0304..dd037c17ae3b3 100755 --- a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py +++ b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py @@ -133,6 +133,8 @@ def main(): str(build_dir / "framework_out"), "--variant", package_variant.name, + "--test_project_stage_dir", # use a specific directory so it's easier to debug + str(build_dir / "test_apple_packages_staging"), ] run(test_apple_packages_args) diff --git a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json index 84d7e355ed5b4..6175ac3a0ad58 100644 --- a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json @@ -19,6 +19,7 @@ "--build_apple_framework", "--use_coreml", "--use_xnnpack", + "--use_webgpu", "--skip_tests", "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF" ], diff --git a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json index e2d8f70c02cf3..4c2c9442ab217 100644 --- a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json @@ -24,12 +24,14 @@ "--ios", "--use_xcode", "--use_xnnpack", + "--use_webgpu", "--apple_deploy_target=13.0" ], "iphonesimulator": [ "--ios", "--use_xcode", "--use_xnnpack", + "--use_webgpu", "--apple_deploy_target=13.0" ], "macabi":[ diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml index 3b661d9eb2dc6..c2b140652a2dd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml @@ -96,7 +96,7 @@ jobs: - template: mac-cpu-packaging-steps.yml parameters: MacosArch: ${{ parameters.MacosArch }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --use_coreml --cmake_extra_defines CMAKE_OSX_ARCHITECTURES="arm64;x86_64" + AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --use_coreml --use_webgpu --cmake_extra_defines 
CMAKE_OSX_ARCHITECTURES="arm64;x86_64" BuildJava: false BuildNodejs: false WithCache: ${{ parameters.WithCache }} @@ -108,7 +108,7 @@ jobs: - template: mac-cpu-packaging-steps.yml parameters: MacosArch: ${{ parameters.MacosArch }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64 + AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --use_webgpu --cmake_extra_defines CMAKE_OSX_ARCHITECTURES=arm64 BuildJava: true BuildNodejs: true WithCache: ${{ parameters.WithCache }} @@ -120,7 +120,7 @@ jobs: - template: mac-cpu-packaging-steps.yml parameters: MacosArch: ${{ parameters.MacosArch }} - AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml + AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} --build_nodejs --build_java --use_coreml --use_webgpu BuildJava: true BuildNodejs: true WithCache: ${{ parameters.WithCache }} From e5233ce865bd70d64830698b93e587888ec459c1 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 25 Sep 2024 17:01:49 -0700 Subject: [PATCH 4/6] enable build pipeline on Windows for WebGPU --- .../win-gpu-webgpu-ci-pipeline.yml | 58 +++++++++++++++++++ tools/ci_build/set-trigger-rules.py | 1 + 2 files changed, 59 insertions(+) create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml new file mode 100644 index 0000000000000..c4db7735aaf2f --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml @@ -0,0 +1,58 @@ +##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py #### +### please do rerun set-trigger-rules.py ### +trigger: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +pr: + branches: + include: + - main + - rel-* + paths: + exclude: + - docs/** + - README.md + - CONTRIBUTING.md + - BUILD.md + - 'js/web' + - 'onnxruntime/core/providers/js' +#### end trigger #### + +parameters: +- name: RunOnnxRuntimeTests + displayName: Run Tests? 
+ type: boolean + default: true + stages: +- stage: webgpu + dependsOn: [] + jobs: + - template: templates/jobs/win-ci-vs-2022-job.yml + parameters: + BuildConfig: 'RelWithDebInfo' + EnvSetupScript: setup_env_cuda.bat + buildArch: x64 + # add --enable_pybind and --build_java if necessary + additionalBuildFlags: >- + --build_nodejs + --use_webgpu + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON + msbuildPlatform: x64 + isX86: false + job_name_suffix: x64_RelWithDebInfo + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + ORT_EP_NAME: WebGPU + EnablePython: false + WITH_CACHE: true + MachinePool: onnxruntime-Win2022-VS2022-webgpu-A10 diff --git a/tools/ci_build/set-trigger-rules.py b/tools/ci_build/set-trigger-rules.py index fb6aa44cdf31a..0e9cd514d8aa5 100644 --- a/tools/ci_build/set-trigger-rules.py +++ b/tools/ci_build/set-trigger-rules.py @@ -40,6 +40,7 @@ "win-gpu-training-ci-pipeline.yml", "win-gpu-doc-gen-ci-pipeline.yml", "win-gpu-tensorrt-ci-pipeline.yml", + "win-gpu-webgpu-ci-pipeline.yml", "win-qnn-arm64-ci-pipeline.yml", "win-qnn-ci-pipeline.yml", ] From 0f7a5f6077f0885aa32b0ede324023419badb3c2 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Fri, 27 Sep 2024 13:49:09 +0800 Subject: [PATCH 5/6] [webgpu native] Add RotaryEmbedding op (#22194) ### Description ### Motivation and Context --- .../webgpu/bert/rotary_embedding.cc | 134 ++++++++++++++++++ .../webgpu/bert/rotary_embedding.h | 47 ++++++ .../webgpu/webgpu_contrib_kernels.cc | 2 +- .../contrib_ops/rotary_embedding_op_test.cc | 4 + 4 files changed, 186 insertions(+), 1 deletion(-) create mode 100644 onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc create mode 100644 onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc new file mode 100644 index 0000000000000..eb5cfad87597f --- /dev/null +++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.cc @@ -0,0 +1,134 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/providers/webgpu/shader_helper.h" +#include "core/providers/webgpu/webgpu_supported_types.h" +#include "contrib_ops/webgpu/webgpu_contrib_kernels.h" +#include "contrib_ops/webgpu/bert/rotary_embedding.h" + +namespace onnxruntime { +namespace contrib { +namespace webgpu { + +ONNX_OPERATOR_KERNEL_EX( + RotaryEmbedding, + kMSDomain, + 1, + kWebGpuExecutionProvider, + (*KernelDefBuilder::Create()) + .TypeConstraint("T", WebGpuSupportedFloatTypes()) + .TypeConstraint("M", DataTypeImpl::GetTensorType<int64_t>()), + RotaryEmbedding); + +Status RotaryEmbeddingProgram::GenerateShaderCode(ShaderHelper& shader) const { + const auto& input = shader.AddInput("input", ShaderUsage::UseUniform); + const auto& position_ids = shader.AddInput("position_ids", ShaderUsage::UseUniform); + const auto& cos_cache = shader.AddInput("cos_cache", ShaderUsage::UseUniform); + const auto& sin_cache = shader.AddInput("sin_cache", ShaderUsage::UseUniform); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform); + // TODO: remove output_indices. + const auto& output_indices = shader.AddIndices("output_indices", false); + const auto interleaved_str = interleaved_ ?
"true" : "false"; + shader.SetMainFunctionBody( + " let half_rotary_emb_dim = uniforms.cos_cache_shape[1];\n" + " let bsnh = global_idx / uniforms.global_stride % uniforms.global_shape;\n" + " let size = uniforms.global_shape[0] * uniforms.global_stride[0];\n", + " if (global_idx >= size) { return; }\n" + " if (bsnh[3] < half_rotary_emb_dim) {\n" + " let position_ids_idx = " + + position_ids.BroadcastedIndicesToOffset("bsnh.xy", output_indices) + ";\n" + + " let position_id = u32(" + + position_ids.GetByOffset("position_ids_idx") + ")" + + " + select(0, bsnh[1], position_ids_idx == 0);\n" + " let i = dot(bsnh, uniforms.input_output_stride) + select(0, bsnh[3], " + + interleaved_str + + ");\n" + " let j = i + select(half_rotary_emb_dim, 1, " + + interleaved_str + + ");\n" + " let re = " + + input.GetByOffset("i") + " * " + cos_cache.GetByIndices("vec2(position_id, bsnh[3])") + "-" + + input.GetByOffset("j") + " * " + sin_cache.GetByIndices("vec2(position_id, bsnh[3])") + ";\n" + + " " + output.SetByOffset("i", "re") + "\n" + + " let im = " + input.GetByOffset("i") + " * " + + sin_cache.GetByIndices("vec2(position_id, bsnh[3])") + + "+ " + input.GetByOffset("j") + + " * " + cos_cache.GetByIndices("vec2(position_id, bsnh[3])") + + ";\n " + output.SetByOffset("j", "im") + + "\n" + " } else { \n" + " let k = dot(bsnh, uniforms.input_output_stride) + half_rotary_emb_dim;\n" + + " " + output.SetByOffset("k", input.GetByOffset("k")) + + "\n" + " }"); + + return Status::OK(); +} + +RotaryEmbedding::RotaryEmbedding(const OpKernelInfo& info) : WebGpuKernel(info) { + scale_ = info.GetAttrOrDefault("scale", 1.0); + rotary_embedding_dim_ = static_cast(info.GetAttrOrDefault("rotary_embedding_dim", 0)); + num_heads_ = static_cast(info.GetAttrOrDefault("num_heads", 0)); + interleaved_ = (info.GetAttrOrDefault("interleaved", 0) == 1); + is_packed_batching_ = (info.GetAttrOrDefault("is_packed_batching", 0) == 1); +} + +Status RotaryEmbedding::ComputeInternal(onnxruntime::webgpu::ComputeContext& context) const { + const auto* input = context.Input(0); + const auto input_shape = input->Shape(); + const auto* position_ids = context.Input(1); + const auto* cos_cache = context.Input(2); + const auto* sin_cache = context.Input(3); + auto* output = context.Output(0, input_shape); + + const auto batch_size = gsl::narrow_cast(input->Shape()[0]); + const auto batch_stride = gsl::narrow_cast(input_shape.SizeFromDimension(1)); + const auto sequence_length = gsl::narrow_cast(input_shape[input_shape.NumDimensions() - 2]); + const auto hidden_size = batch_stride / sequence_length; + const auto half_rotary_embedding_dim = gsl::narrow_cast(cos_cache->Shape()[1]); + const auto head_size = rotary_embedding_dim_ == 0 ? half_rotary_embedding_dim * 2 : hidden_size / num_heads_; + + // Rotary embeddings will be calculated in a pair-wise fashion. In accordance, use the shape + // [batch size, sequence length, num of heads, num of pairs to rotate + num of dims to copy] + // to unfold the global index in shader. 
+ const TensorShape global_shape({batch_size, + sequence_length, + hidden_size / head_size, + head_size - half_rotary_embedding_dim}); + + const auto rank = global_shape.NumDimensions(); + std::vector<uint32_t> global_dims(rank); + std::vector<uint32_t> global_strides(rank); + for (size_t j = 0; j < rank; ++j) { + global_dims[j] = gsl::narrow_cast<uint32_t>(global_shape[j]); + global_strides[j] = gsl::narrow_cast<uint32_t>(global_shape.SizeFromDimension(j + 1)); + } + + const auto output_size = gsl::narrow_cast<uint32_t>(global_shape.Size()); + RotaryEmbeddingProgram program{interleaved_}; + const auto input_output_strides = + input_shape.NumDimensions() == 3 + ? std::vector<uint32_t>({batch_stride, hidden_size, head_size, 1}) + : (input_shape.NumDimensions() == 4 + ? std::vector<uint32_t>({batch_stride, head_size, sequence_length * head_size, 1}) + : std::vector<uint32_t>({})); + + program + .CacheHint(interleaved_) + .AddInputs({{input, ProgramTensorMetadataDependency::Rank}, + {position_ids, ProgramTensorMetadataDependency::Rank}, + {cos_cache, ProgramTensorMetadataDependency::Rank}, + {sin_cache, ProgramTensorMetadataDependency::Rank}}) + .AddOutput({output, ProgramTensorMetadataDependency::None}) + .SetDispatchGroupSize((output_size + WORKGROUP_SIZE - 1) / WORKGROUP_SIZE) + .AddUniformVariables({{scale_}, + {gsl::make_span(global_dims)}, + {gsl::make_span(global_strides)}, + {gsl::make_span(input_output_strides)}}) + .AddIndices(TensorShape{1, 1}); + return context.RunProgram(program); +} + +} // namespace webgpu +} // namespace contrib +} // namespace onnxruntime
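For reference, the pair-wise update the shader computes is the standard rotation; writing theta for the cached angle indexed by (position_id, d) and (i, j) for an element pair (adjacent when interleaved, half_rotary_emb_dim apart otherwise), the re/im assignments above correspond to:

    \begin{aligned}
    \mathrm{out}_i &= x_i \cos\theta - x_j \sin\theta \\
    \mathrm{out}_j &= x_i \sin\theta + x_j \cos\theta
    \end{aligned}

Elements past the rotary dimension take the else branch and are copied through unchanged.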
diff --git a/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h new file mode 100644 index 0000000000000..0d73b89fb62df --- /dev/null +++ b/onnxruntime/contrib_ops/webgpu/bert/rotary_embedding.h @@ -0,0 +1,47 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include "core/providers/webgpu/program.h" +#include "core/providers/webgpu/webgpu_kernel.h" + +namespace onnxruntime { +namespace contrib { +namespace webgpu { + +using namespace onnxruntime::webgpu; +using onnxruntime::webgpu::ComputeContext; + +class RotaryEmbeddingProgram final : public Program<RotaryEmbeddingProgram> { + public: + RotaryEmbeddingProgram(bool interleaved) : Program{"RotaryEmbedding"}, interleaved_{interleaved} { + } + + Status GenerateShaderCode(ShaderHelper& sh) const override; + + WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"scale", ProgramUniformVariableDataType::Float32}, + {"global_shape", ProgramUniformVariableDataType::Uint32}, + {"global_stride", ProgramUniformVariableDataType::Uint32}, + {"input_output_stride", ProgramUniformVariableDataType::Uint32}); + + private: + const bool interleaved_; +}; + +class RotaryEmbedding final : public WebGpuKernel { + public: + RotaryEmbedding(const OpKernelInfo& info); + Status ComputeInternal(ComputeContext& context) const override; + + private: + float scale_; + int num_heads_; + int rotary_embedding_dim_; + bool interleaved_; + bool is_packed_batching_; +}; + +} // namespace webgpu +} // namespace contrib +} // namespace onnxruntime diff --git a/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc b/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc index def104b6cb108..01c8a28d45069 100644 --- a/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc @@ -47,7 +47,7 @@ Status RegisterWebGpuContribKernels(KernelRegistry& kernel_registry) { // BuildKernelCreateInfo, // BuildKernelCreateInfo, // BuildKernelCreateInfo, - // BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, RotaryEmbedding)>, + BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kMSDomain, 1, RotaryEmbedding)>, // BuildKernelCreateInfo, // BuildKernelCreateInfo Date: Fri, 27 Sep 2024 14:57:21 +0800 Subject: [PATCH 6/6] [webgpu native] Add transpose shared (#22098) ### Description ### Motivation and Context --- .../core/providers/webgpu/tensor/transpose.cc | 91 +++++++++++++++---- .../core/providers/webgpu/tensor/transpose.h | 24 ++--- 2 files changed, 87 insertions(+), 28 deletions(-) diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.cc b/onnxruntime/core/providers/webgpu/tensor/transpose.cc index 0962d9191d785..e0a0113e13224 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.cc +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.cc @@ -47,11 +47,11 @@ ONNX_OPERATOR_KERNEL_EX( .TypeConstraint("T", WebGpuSupportedNumberTypes()), Transpose); -const std::string AppendPermFunction(gsl::span<const int64_t> perm) { +const std::string AppendPermFunction(gsl::span<const size_t> perm) { std::ostringstream ss; ss.imbue(std::locale::classic()); - ss << "fn perm(i: y_indices_t)->x_indices_t {\n" - " var a: x_indices_t;\n"; + ss << "fn perm(i: output_indices_t)->a_indices_t {\n" + " var a: a_indices_t;\n"; for (size_t i = 0; i < perm.size(); ++i) { ss << " a[" << perm[i] << "] = i[" << i << "];\n"; } @@ -60,21 +60,52 @@ const std::string AppendPermFunction(gsl::span<const int64_t> perm) { return ss.str(); } +auto SqueezeShape(const gsl::span<const int64_t>& shape, const gsl::span<const size_t>& adjusted_perm, InlinedVector<int64_t>& new_shape, InlinedVector<size_t>& new_perm) { + for (auto i = 0; i < shape.size(); ++i) { + if (shape[i] != 1) { + new_shape.push_back(shape[i]); + } + if (shape[adjusted_perm[i]] != 1) { + new_perm.push_back(adjusted_perm[i]); + } + } +}; + Status TransposeProgram::GenerateShaderCode(ShaderHelper& shader) const { - const auto& input = shader.AddInput("x", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); - const auto& output =
shader.AddOutput("y", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); - shader.AppendImplementation(AppendPermFunction(this->perm_)); - shader.SetMainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"), - " let indices = ", output.OffsetToIndices("global_idx"), - ";\n" - " let x_indices = perm(indices); \n" - " ", - output.SetByOffset("global_idx", input.GetByIndices("x_indices"))); + const auto& input = shader.AddInput("a", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias); + const auto& output = shader.AddOutput("output", ShaderUsage::UseUniform | ShaderUsage::UseIndicesTypeAlias | ShaderUsage::UseValueTypeAlias); + + if (use_shared_) { + shader.AppendImplementation("var tile : array, tile_size>;\n"); + shader.SetMainFunctionBody( + " let stride = (uniforms.output_shape[1] - 1) / tile_size + 1;\n" + " let workgroup_id_x = workgroup_idx % stride;\n" + " let workgroup_id_y = workgroup_idx / stride;\n" + " let input_col = workgroup_id_y * tile_size + local_id.x;\n" + " let input_row = workgroup_id_x * tile_size + local_id.y;\n" + " if (input_row < uniforms.a_shape[0] && input_col < uniforms.a_shape[1]) {\n" + " tile[local_id.y][local_id.x] = " + + input.GetByIndices("a_indices_t(input_row, input_col)") + + ";\n" + " }\n" + " workgroupBarrier();\n" + " let output_col = workgroup_id_x * tile_size + local_id.x;\n" + " let output_row = workgroup_id_y * tile_size + local_id.y;\n" + " if (output_row < uniforms.output_shape[0] && output_col < uniforms.output_shape[1]) {\n " + + output.SetByIndices("output_indices_t(output_row, output_col)", "tile[local_id.x][local_id.y]") + "\n }"); + } else { + shader.AppendImplementation(AppendPermFunction(this->perm_)); + shader.SetMainFunctionBody(shader.GuardAgainstOutOfBoundsWorkgroupSizes("uniforms.output_size"), + " let indices = ", output.OffsetToIndices("global_idx"), + ";\n" + " let x_indices = perm(indices);\n", + " ", + output.SetByOffset("global_idx", input.GetByIndices("x_indices"))); + } return Status::OK(); } Status Transpose::ComputeInternal(ComputeContext& context) const { - // TODO: there is an optimized version of transpose to port. const auto* input_tensor = context.Input(0); const TensorShape& input_shape = input_tensor->Shape(); int32_t rank = gsl::narrow_cast(input_shape.NumDimensions()); @@ -86,16 +117,42 @@ Status Transpose::ComputeInternal(ComputeContext& context) const { TensorShape output_shape(output_dims); auto* output_tensor = context.Output(0, output_shape); + InlinedVector new_shape{}; + InlinedVector new_perm{}; + SqueezeShape(input_shape.GetDims(), *p_perm, new_shape, new_perm); + const bool channels_last = new_perm == InlinedVector({2, 3, 1}); + const bool channels_first = new_perm == InlinedVector({3, 1, 2}); + const bool use_shared = (new_shape.size() == 2 && new_perm[0] > new_perm[1]) || channels_last || channels_first; + auto new_input_shape = input_shape; + TensorShape new_output_shape(output_dims); + if (use_shared) { + new_input_shape = channels_last + ? TensorShape({new_shape[0], new_shape[1] * new_shape[2]}) + : channels_first + ? 
diff --git a/onnxruntime/core/providers/webgpu/tensor/transpose.h b/onnxruntime/core/providers/webgpu/tensor/transpose.h index 3ca5674d5dfab..7cf5c1fe0865d 100644 --- a/onnxruntime/core/providers/webgpu/tensor/transpose.h +++ b/onnxruntime/core/providers/webgpu/tensor/transpose.h @@ -11,26 +11,28 @@ namespace onnxruntime { namespace webgpu { +class Transpose final : public WebGpuKernel, public TransposeBase { + public: + Transpose(const OpKernelInfo& info) : WebGpuKernel{info}, TransposeBase{info} { + } + Status ComputeInternal(ComputeContext& context) const override; + constexpr static uint32_t TILE_SIZE = 16; +}; + class TransposeProgram final : public Program<TransposeProgram> { public: - TransposeProgram(const gsl::span<const int64_t>& permutations) - : Program{"Transpose"}, perm_(permutations.begin(), permutations.end()) { + TransposeProgram(const gsl::span<const size_t>& permutations, bool use_shared) + : Program{"Transpose"}, perm_(permutations.begin(), permutations.end()), use_shared_(use_shared) { } Status GenerateShaderCode(ShaderHelper& sh) const override; WEBGPU_PROGRAM_DEFINE_UNIFORM_VARIABLES({"output_size", ProgramUniformVariableDataType::Uint32}); + WEBGPU_PROGRAM_DEFINE_CONSTANTS({"tile_size", Transpose::TILE_SIZE}); private: - InlinedVector<int64_t> perm_; -}; - -class Transpose final : public WebGpuKernel, public TransposeBase { - public: - Transpose(const OpKernelInfo& info) : WebGpuKernel{info}, TransposeBase{info} { - } - - Status ComputeInternal(ComputeContext& context) const override; + InlinedVector<size_t> perm_; + const bool use_shared_; }; } // namespace webgpu