From e1b8ce4e360c64f6d8e81230b535c396d7d18833 Mon Sep 17 00:00:00 2001
From: Yulong Wang <7679871+fs-eire@users.noreply.github.com>
Date: Tue, 8 Oct 2024 16:10:46 -0700
Subject: [PATCH] Initial WebGPU EP checkin (#22318)

This change introduces the WebGPU EP into ONNX Runtime.

To make the PR as simple as possible, this PR excludes the following:
- C API changes for WebGPU EP
- actual implementation of WebGPU EP. Currently in this PR, WebGPU is a
stub implementation that does not register any kernel.
- Python IO Binding update
- Node.js IO Binding update

This PR now contains only 43 file changes (while the working branch
contains 130+) and hopefully this makes it easier to review.

There will be separate PRs for each item mentioned above.

Current working branch: #21904
---
 cmake/CMakeLists.txt                          |  22 ++++
 .../external/onnxruntime_external_deps.cmake  |  82 ++++++++++---
 cmake/onnxruntime.cmake                       |  67 ++++++++++-
 cmake/onnxruntime_nodejs.cmake                |   5 +-
 cmake/onnxruntime_objectivec.cmake            |  14 ---
 cmake/onnxruntime_providers.cmake             |   7 ++
 cmake/onnxruntime_providers_cpu.cmake         |   5 +
 cmake/onnxruntime_providers_webgpu.cmake      |  27 +++++
 cmake/onnxruntime_python.cmake                |   1 +
 cmake/onnxruntime_unittests.cmake             |  12 ++
 cmake/patches/dawn/dawn.patch                 |  66 +++++++++++
 include/onnxruntime/core/graph/constants.h    |   1 +
 .../webgpu/webgpu_provider_factory.h          |  14 +++
 .../main/java/ai/onnxruntime/OrtProvider.java |   4 +-
 .../webgpu/webgpu_contrib_kernels.cc          |  34 ++++++
 .../webgpu/webgpu_contrib_kernels.h           |  20 ++++
 .../platform/apple/logging/apple_log_sink.mm  |   2 -
 .../core/providers/get_execution_providers.cc |   8 ++
 .../providers/provider_factory_creators.h     |   4 +
 onnxruntime/core/providers/webgpu/symbols.txt |   0
 .../webgpu/webgpu_execution_provider.cc       | 108 ++++++++++++++++++
 .../webgpu/webgpu_execution_provider.h        |  37 ++++++
 .../webgpu/webgpu_provider_factory.cc         |  24 ++++
 .../webgpu/webgpu_provider_factory_creator.h  |  18 +++
 onnxruntime/core/session/inference_session.cc |   8 +-
 .../core/session/provider_registration.cc     |   6 +
 .../python/onnxruntime_pybind_state.cc        |   4 +
 onnxruntime/test/onnx/main.cc                 |  62 +++++++++-
 .../test/perftest/command_args_parser.cc      |   6 +-
 onnxruntime/test/perftest/ort_test_session.cc |   7 ++
 .../ios_package_uitest_cpp_api.mm             |  23 +++-
 .../macos_package_uitest_cpp_api.mm           |  24 +++-
 onnxruntime/test/providers/base_tester.cc     |   3 +
 .../providers/compare_provider_test_utils.cc  |   2 +
 onnxruntime/test/util/default_providers.cc    |   9 ++
 .../test/util/include/default_providers.h     |   1 +
 tools/ci_build/build.py                       |   5 +
 tools/ci_build/gen_def.py                     |   1 +
 .../apple/build_and_assemble_apple_pods.py    |   2 +
 .../win-gpu-webgpu-ci-pipeline.yml            |  58 ++++++++++
 tools/ci_build/set-trigger-rules.py           |   1 +
 41 files changed, 748 insertions(+), 56 deletions(-)
 create mode 100644 cmake/onnxruntime_providers_webgpu.cmake
 create mode 100644 cmake/patches/dawn/dawn.patch
 create mode 100644 include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h
 create mode 100644 onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc
 create mode 100644 onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.h
 create mode 100644 onnxruntime/core/providers/webgpu/symbols.txt
 create mode 100644 onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
 create mode 100644 onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
 create mode 100644 onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
 create mode 100644 onnxruntime/core/providers/webgpu/webgpu_provider_factory_creator.h
 create mode 100644 tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
index d56b1cd3f5201..ec536205b228e 100644
--- a/cmake/CMakeLists.txt
+++ b/cmake/CMakeLists.txt
@@ -148,6 +148,7 @@ option(onnxruntime_TVM_USE_LLVM "Build TVM with LLVM. Set customized path to llv
 option(onnxruntime_TVM_USE_HASH "Build ipp-crypto library for support hash algorithm. It is defined for TVM only")
 option(onnxruntime_USE_XNNPACK "Build with XNNPACK support. Provides an alternative math library on ARM, WebAssembly and x86." OFF)
 option(onnxruntime_USE_WEBNN "Build with WebNN support. Enable hardware acceleration in web browsers." OFF)
+option(onnxruntime_USE_WEBGPU "Build with WebGPU support. Enable WebGPU via C/C++ interface." OFF)
 
 # Options related to reducing the binary size produced by the build
 # XNNPACK EP requires the internal NHWC contrib ops to be available, so this option must be OFF when onnxruntime_USE_XNNPACK is ON
@@ -490,6 +491,22 @@ if (onnxruntime_BUILD_CSHARP)
   endif()
 endif()
 
+if (onnxruntime_BUILD_OBJC)
+  check_language(OBJC)
+  if(CMAKE_OBJC_COMPILER)
+      enable_language(OBJC)
+  else()
+      message(FATAL_ERROR "Objective-C is not supported.")
+  endif()
+
+  check_language(OBJCXX)
+  if(CMAKE_OBJCXX_COMPILER)
+      enable_language(OBJCXX)
+  else()
+      message(FATAL_ERROR "Objective-C++ is not supported.")
+  endif()
+endif()
+
 if (NOT WIN32)
   #TODO: On Linux we may try https://github.com/microsoft/TraceLogging.git
   if (onnxruntime_ENABLE_INSTRUMENT)
@@ -917,6 +934,11 @@ if (onnxruntime_USE_WEBNN)
   list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_WEBNN=1)
   list(APPEND ONNXRUNTIME_PROVIDER_NAMES webnn)
 endif()
+if (onnxruntime_USE_WEBGPU)
+  list(APPEND ORT_PROVIDER_FLAGS -DUSE_WEBGPU=1)
+  list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_WEBGPU=1)
+  list(APPEND ONNXRUNTIME_PROVIDER_NAMES webgpu)
+endif()
 if (onnxruntime_USE_CANN)
     list(APPEND ORT_PROVIDER_FLAGS  -DUSE_CANN=1)
     list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_CANN=1)
diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake
index cb737ee53639f..85746027d4e8c 100644
--- a/cmake/external/onnxruntime_external_deps.cmake
+++ b/cmake/external/onnxruntime_external_deps.cmake
@@ -576,10 +576,11 @@ if (onnxruntime_USE_MIMALLOC)
   onnxruntime_fetchcontent_makeavailable(mimalloc)
 endif()
 
-#onnxruntime_EXTERNAL_LIBRARIES could contain onnx, onnx_proto,libprotobuf, cuda/cudnn,
-# dnnl/mklml, onnxruntime_codegen_tvm, tvm and pthread
-# pthread is always at the last
-set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${WIL_TARGET} nlohmann_json::nlohmann_json onnx onnx_proto ${PROTOBUF_LIB} re2::re2 Boost::mp11 safeint_interface flatbuffers::flatbuffers ${GSL_TARGET} ${ABSEIL_LIBS} date::date ${ONNXRUNTIME_CLOG_TARGET_NAME})
+set(onnxruntime_EXTERNAL_LIBRARIES ${onnxruntime_EXTERNAL_LIBRARIES_XNNPACK} ${WIL_TARGET} nlohmann_json::nlohmann_json
+                                   onnx onnx_proto ${PROTOBUF_LIB} re2::re2 Boost::mp11 safeint_interface
+                                   flatbuffers::flatbuffers ${GSL_TARGET} ${ABSEIL_LIBS} date::date
+                                   ${ONNXRUNTIME_CLOG_TARGET_NAME})
+
 # The source code of onnx_proto is generated, we must build this lib first before starting to compile the other source code that uses ONNX protobuf types.
 # The other libs do not have the problem. All the sources are already there. We can compile them in any order.
 set(onnxruntime_EXTERNAL_DEPENDENCIES onnx_proto flatbuffers::flatbuffers)
@@ -634,24 +635,73 @@ if (onnxruntime_USE_COREML)
   FetchContent_Populate(coremltools)
 endif()
 
-message(STATUS "Finished fetching external dependencies")
+if (onnxruntime_USE_WEBGPU)
+  FetchContent_Declare(
+    dawn
+    URL ${DEP_URL_dawn}
+    URL_HASH SHA1=${DEP_SHA1_dawn}
+    PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/dawn/dawn.patch
+  )
 
-set(onnxruntime_LINK_DIRS )
+  # use dawn::dawn_native and dawn::dawn_proc instead of the monolithic dawn::webgpu_dawn to minimize binary size
+  set(DAWN_BUILD_MONOLITHIC_LIBRARY OFF CACHE BOOL "" FORCE)
+  set(DAWN_BUILD_SAMPLES OFF CACHE BOOL "" FORCE)
+  set(DAWN_ENABLE_INSTALL OFF CACHE BOOL "" FORCE)
+  set(DAWN_ENABLE_NULL OFF CACHE BOOL "" FORCE)
+  set(DAWN_FETCH_DEPENDENCIES ON CACHE BOOL "" FORCE)
+
+  # disable things we don't use
+  set(DAWN_DXC_ENABLE_ASSERTS_IN_NDEBUG OFF)
+  set(DAWN_ENABLE_DESKTOP_GL OFF CACHE BOOL "" FORCE)
+  set(DAWN_ENABLE_OPENGLES OFF CACHE BOOL "" FORCE)
+  set(DAWN_SUPPORTS_GLFW_FOR_WINDOWING OFF CACHE BOOL "" FORCE)
+  set(DAWN_USE_GLFW OFF CACHE BOOL "" FORCE)
+  set(DAWN_USE_WINDOWS_UI OFF CACHE BOOL "" FORCE)
+  set(DAWN_USE_X11 OFF CACHE BOOL "" FORCE)
+
+  set(TINT_BUILD_TESTS OFF CACHE BOOL "" FORCE)
+  set(TINT_BUILD_CMD_TOOLS OFF CACHE BOOL "" FORCE)
+  set(TINT_BUILD_GLSL_WRITER OFF CACHE BOOL "" FORCE)
+  set(TINT_BUILD_GLSL_VALIDATOR OFF CACHE BOOL "" FORCE)
+  set(TINT_BUILD_IR_BINARY OFF CACHE BOOL "" FORCE)
+  set(TINT_BUILD_SPV_READER OFF CACHE BOOL "" FORCE)  # don't need. disabling is a large binary size saving
+  set(TINT_BUILD_WGSL_WRITER ON CACHE BOOL "" FORCE)  # needed to create cache key. runtime error if not enabled.
+
+  # SPIR-V validation shouldn't be required given we're using Tint to create the SPIR-V.
+  set(DAWN_ENABLE_SPIRV_VALIDATION OFF CACHE BOOL "" FORCE)
+
+  if (WIN32)
+    # Building this requires the HLSL writer to be enabled in Tint. TBD whether we need either of these to be ON.
+    set(DAWN_USE_BUILT_DXC ON CACHE BOOL "" FORCE)
+    set(TINT_BUILD_HLSL_WRITER ON CACHE BOOL "" FORCE)
+
+    # Vulkan may optionally be included in a Windows build. Exclude until we have an explicit use case that requires it.
+    set(DAWN_ENABLE_VULKAN OFF CACHE BOOL "" FORCE)
+  endif()
+
+  onnxruntime_fetchcontent_makeavailable(dawn)
 
+  list(APPEND onnxruntime_EXTERNAL_LIBRARIES dawn::dawn_native dawn::dawn_proc)
+endif()
+
+set(onnxruntime_LINK_DIRS)
 if (onnxruntime_USE_CUDA)
-      find_package(CUDAToolkit REQUIRED)
+  find_package(CUDAToolkit REQUIRED)
 
-      if(onnxruntime_CUDNN_HOME)
-        file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME)
-        set(CUDNN_PATH ${onnxruntime_CUDNN_HOME})
-      endif()
-      include(cuDNN)
+  if(onnxruntime_CUDNN_HOME)
+    file(TO_CMAKE_PATH ${onnxruntime_CUDNN_HOME} onnxruntime_CUDNN_HOME)
+    set(CUDNN_PATH ${onnxruntime_CUDNN_HOME})
+  endif()
+
+  include(cuDNN)
 endif()
 
 if(onnxruntime_USE_SNPE)
-    include(external/find_snpe.cmake)
-    list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SNPE_NN_LIBS})
+  include(external/find_snpe.cmake)
+  list(APPEND onnxruntime_EXTERNAL_LIBRARIES ${SNPE_NN_LIBS})
 endif()
 
-FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR}  ORT_BINARY_DIR)
-FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR}  ORT_SOURCE_DIR)
+FILE(TO_NATIVE_PATH ${CMAKE_BINARY_DIR} ORT_BINARY_DIR)
+FILE(TO_NATIVE_PATH ${PROJECT_SOURCE_DIR} ORT_SOURCE_DIR)
+
+message(STATUS "Finished fetching external dependencies")
diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake
index 5ce5d95541d8f..c5c991d66878e 100644
--- a/cmake/onnxruntime.cmake
+++ b/cmake/onnxruntime.cmake
@@ -90,10 +90,22 @@ elseif(onnxruntime_BUILD_APPLE_FRAMEWORK)
   # create Info.plist for the framework and podspec for CocoaPods (optional)
   set(MACOSX_FRAMEWORK_NAME "onnxruntime")
   set(MACOSX_FRAMEWORK_IDENTIFIER "com.microsoft.onnxruntime")
-  # Need to include CoreML as a weaklink for CocoaPods package if the EP is enabled
+
+  # Setup weak frameworks for macOS/iOS. 'weak' as the CoreML or WebGPU EPs are optionally enabled.
   if(onnxruntime_USE_COREML)
-    set(APPLE_WEAK_FRAMEWORK "\\\"CoreML\\\"")
+    list(APPEND _weak_frameworks "\\\"CoreML\\\"")
+  endif()
+
+  if(onnxruntime_USE_WEBGPU)
+    list(APPEND _weak_frameworks "\\\"QuartzCore\\\"")
+    list(APPEND _weak_frameworks "\\\"IOSurface\\\"")
+    list(APPEND _weak_frameworks "\\\"Metal\\\"")
   endif()
+
+  if (_weak_frameworks)
+    string(JOIN ", " APPLE_WEAK_FRAMEWORK ${_weak_frameworks})
+  endif()
+
   set(INFO_PLIST_PATH "${CMAKE_CURRENT_BINARY_DIR}/Info.plist")
   configure_file(${REPO_ROOT}/cmake/Info.plist.in ${INFO_PLIST_PATH})
   configure_file(
@@ -202,6 +214,7 @@ set(onnxruntime_INTERNAL_LIBRARIES
   ${PROVIDERS_RKNPU}
   ${PROVIDERS_VSINPU}
   ${PROVIDERS_XNNPACK}
+  ${PROVIDERS_WEBGPU}
   ${PROVIDERS_WEBNN}
   ${PROVIDERS_AZURE}
   ${PROVIDERS_INTERNAL_TESTING}
@@ -366,16 +379,58 @@ if(onnxruntime_BUILD_APPLE_FRAMEWORK)
     endif()
   endforeach()
 
+  # Helper: symlink static library target `lib`, and (recursively) every static library it links, into STATIC_LIB_DIR.
+  set(_processed_libs)  # targets already handled, so a dependency shared by several libraries is only symlinked once
+  function(add_symlink_for_static_lib_and_dependencies lib)
+    function(process cur_target)  # appends cur_target and its LINK_LIBRARIES targets to lib_and_dependencies
+      # de-alias if applicable so a consistent target name is used
+      get_target_property(alias ${cur_target} ALIASED_TARGET)
+      if(TARGET ${alias})
+        set(cur_target ${alias})
+      endif()
+
+      # quoted so the target name is compared as a string and never dereferenced as a variable name
+      if("${cur_target}" IN_LIST _processed_libs OR "${cur_target}" IN_LIST lib_and_dependencies)
+        return()
+      endif()
+      list(APPEND lib_and_dependencies ${cur_target})
+
+      get_target_property(link_libraries ${cur_target} LINK_LIBRARIES)
+      foreach(dependency ${link_libraries})
+        if(TARGET ${dependency})  # skips plain library names, generator expressions and the NOTFOUND value
+          process(${dependency})
+        endif()
+      endforeach()
+
+      set(lib_and_dependencies ${lib_and_dependencies} PARENT_SCOPE)  # hand the accumulated list back to the caller
+    endfunction()
+
+    set(lib_and_dependencies)
+    process(${lib})
+
+    foreach(_target ${lib_and_dependencies})
+      get_target_property(type ${_target} TYPE)
+      if("${type}" STREQUAL "STATIC_LIBRARY")
+        add_custom_command(TARGET onnxruntime POST_BUILD
+                           COMMAND ${CMAKE_COMMAND} -E create_symlink
+                             $<TARGET_FILE:${_target}> ${STATIC_LIB_DIR}/$<TARGET_LINKER_FILE_NAME:${_target}>
+                           VERBATIM)
+      endif()
+    endforeach()
+
+    list(APPEND _processed_libs ${lib_and_dependencies})
+    set(_processed_libs ${_processed_libs} PARENT_SCOPE)
+  endfunction()
+
   # for external libraries we create a symlink to the .a file
   foreach(_LIB ${onnxruntime_EXTERNAL_LIBRARIES})
-    if(NOT TARGET ${_LIB}) # if we didn't build from source. it may not a target
+    if(NOT TARGET ${_LIB}) # if we didn't build from source it may not be a target
       continue()
     endif()
+
     GET_TARGET_PROPERTY(_LIB_TYPE ${_LIB} TYPE)
     if(_LIB_TYPE STREQUAL "STATIC_LIBRARY")
-      add_custom_command(TARGET onnxruntime POST_BUILD
-                         COMMAND ${CMAKE_COMMAND} -E create_symlink
-                           $<TARGET_FILE:${_LIB}> ${STATIC_LIB_DIR}/$<TARGET_LINKER_FILE_NAME:${_LIB}>)
+      add_symlink_for_static_lib_and_dependencies(${_LIB})
     endif()
   endforeach()
 
diff --git a/cmake/onnxruntime_nodejs.cmake b/cmake/onnxruntime_nodejs.cmake
index f11928c11cf14..376d895be34a9 100644
--- a/cmake/onnxruntime_nodejs.cmake
+++ b/cmake/onnxruntime_nodejs.cmake
@@ -67,6 +67,9 @@ endif()
 if (onnxruntime_USE_DML)
     set(NODEJS_BINDING_USE_DML "--use_dml")
 endif()
+if (onnxruntime_USE_WEBGPU)
+    set(NODEJS_BINDING_USE_WEBGPU "--use_webgpu")
+endif()
 if (onnxruntime_USE_TENSORRT)
     set(NODEJS_BINDING_USE_TENSORRT "--use_tensorrt")
 endif()
@@ -92,7 +95,7 @@ add_custom_target(js_common_npm_ci ALL
 add_custom_target(nodejs_binding_wrapper ALL
     COMMAND ${NPM_CLI} ci
     COMMAND ${NPM_CLI} run build -- --onnxruntime-build-dir=${CMAKE_CURRENT_BINARY_DIR} --config=${CMAKE_BUILD_TYPE} --onnxruntime-generator=${CMAKE_GENERATOR}
-        --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_TENSORRT}
+        --arch=${NODEJS_BINDING_ARCH} ${NODEJS_BINDING_USE_CUDA} ${NODEJS_BINDING_USE_DML} ${NODEJS_BINDING_USE_WEBGPU} ${NODEJS_BINDING_USE_TENSORRT}
         ${NODEJS_BINDING_USE_COREML} ${NODEJS_BINDING_USE_QNN}
     WORKING_DIRECTORY ${JS_NODE_ROOT}
     COMMENT "Using cmake-js to build OnnxRuntime Node.js binding")
diff --git a/cmake/onnxruntime_objectivec.cmake b/cmake/onnxruntime_objectivec.cmake
index 4be2f51a96ebc..7c9831f0194d0 100644
--- a/cmake/onnxruntime_objectivec.cmake
+++ b/cmake/onnxruntime_objectivec.cmake
@@ -9,20 +9,6 @@ if(NOT onnxruntime_BUILD_SHARED_LIB)
     message(FATAL_ERROR "The Objective-C API requires onnxruntime_BUILD_SHARED_LIB to be enabled.")
 endif()
 
-check_language(OBJC)
-if(CMAKE_OBJC_COMPILER)
-    enable_language(OBJC)
-else()
-    message(FATAL_ERROR "Objective-C is not supported.")
-endif()
-
-check_language(OBJCXX)
-if(CMAKE_OBJCXX_COMPILER)
-    enable_language(OBJCXX)
-else()
-    message(FATAL_ERROR "Objective-C++ is not supported.")
-endif()
-
 add_compile_options(
     "$<$<COMPILE_LANGUAGE:OBJC,OBJCXX>:-Wall>"
     "$<$<COMPILE_LANGUAGE:OBJC,OBJCXX>:-Wextra>")
diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake
index 6bae330c8b4c1..0df84854a20c0 100644
--- a/cmake/onnxruntime_providers.cmake
+++ b/cmake/onnxruntime_providers.cmake
@@ -114,6 +114,9 @@ endif()
 if(onnxruntime_USE_WEBNN)
   set(PROVIDERS_WEBNN onnxruntime_providers_webnn)
 endif()
+if(onnxruntime_USE_WEBGPU)
+  set(PROVIDERS_WEBGPU onnxruntime_providers_webgpu)
+endif()
 if (onnxruntime_USE_CANN)
   set(PROVIDERS_CANN onnxruntime_providers_cann)
 endif()
@@ -155,6 +158,10 @@ if (onnxruntime_USE_WEBNN)
   include(onnxruntime_providers_webnn.cmake)
 endif()
 
+if (onnxruntime_USE_WEBGPU)
+  include(onnxruntime_providers_webgpu.cmake)
+endif()
+
 if (onnxruntime_USE_NNAPI_BUILTIN)
   include(onnxruntime_providers_nnapi.cmake)
 endif()
diff --git a/cmake/onnxruntime_providers_cpu.cmake b/cmake/onnxruntime_providers_cpu.cmake
index 295a8bbca70f7..91a2b13002ec9 100644
--- a/cmake/onnxruntime_providers_cpu.cmake
+++ b/cmake/onnxruntime_providers_cpu.cmake
@@ -40,6 +40,11 @@ file(GLOB_RECURSE onnxruntime_js_contrib_ops_cc_srcs CONFIGURE_DEPENDS
   "${ONNXRUNTIME_ROOT}/contrib_ops/js/*.cc"
 )
 
+file(GLOB_RECURSE onnxruntime_webgpu_contrib_ops_cc_srcs CONFIGURE_DEPENDS
+  "${ONNXRUNTIME_ROOT}/contrib_ops/webgpu/*.h"
+  "${ONNXRUNTIME_ROOT}/contrib_ops/webgpu/*.cc"
+)
+
 file(GLOB onnxruntime_providers_common_srcs CONFIGURE_DEPENDS
   "${ONNXRUNTIME_ROOT}/core/providers/*.h"
   "${ONNXRUNTIME_ROOT}/core/providers/*.cc"
diff --git a/cmake/onnxruntime_providers_webgpu.cmake b/cmake/onnxruntime_providers_webgpu.cmake
new file mode 100644
index 0000000000000..eb25c55ab23e0
--- /dev/null
+++ b/cmake/onnxruntime_providers_webgpu.cmake
@@ -0,0 +1,27 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+  if (onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_EXTENDED_MINIMAL_BUILD)
+    message(FATAL_ERROR "WebGPU EP can not be used in a basic minimal build. Please build with '--minimal_build extended'")
+  endif()
+  # Directory-scoped preprocessor definitions for the WebGPU EP.
+  add_compile_definitions(USE_WEBGPU=1)
+  if (onnxruntime_ENABLE_WEBASSEMBLY_THREADS)
+    add_compile_definitions(ENABLE_WEBASSEMBLY_THREADS=1)
+  endif()
+  file(GLOB_RECURSE onnxruntime_providers_webgpu_cc_srcs CONFIGURE_DEPENDS
+    "${ONNXRUNTIME_ROOT}/core/providers/webgpu/*.h"
+    "${ONNXRUNTIME_ROOT}/core/providers/webgpu/*.cc"
+  )
+  if(NOT onnxruntime_DISABLE_CONTRIB_OPS)
+    source_group(TREE ${ONNXRUNTIME_ROOT} FILES ${onnxruntime_webgpu_contrib_ops_cc_srcs})
+    list(APPEND onnxruntime_providers_webgpu_cc_srcs ${onnxruntime_webgpu_contrib_ops_cc_srcs})
+  endif()
+  # Build the EP (plus the WebGPU contrib ops, unless disabled above) as a static library linked against Dawn.
+  source_group(TREE ${REPO_ROOT} FILES ${onnxruntime_providers_webgpu_cc_srcs})
+  onnxruntime_add_static_library(onnxruntime_providers_webgpu ${onnxruntime_providers_webgpu_cc_srcs})
+  onnxruntime_add_include_to_target(onnxruntime_providers_webgpu
+    onnxruntime_common dawn::dawncpp_headers dawn::dawn_headers onnx onnx_proto flatbuffers::flatbuffers Boost::mp11 safeint_interface)
+  target_link_libraries(onnxruntime_providers_webgpu dawn::dawn_native dawn::dawn_proc)
+
+  set_target_properties(onnxruntime_providers_webgpu PROPERTIES FOLDER "ONNXRuntime")
diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake
index cb69886ce671a..0d038d210ea2b 100644
--- a/cmake/onnxruntime_python.cmake
+++ b/cmake/onnxruntime_python.cmake
@@ -178,6 +178,7 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE
     ${PROVIDERS_ACL}
     ${PROVIDERS_ARMNN}
     ${PROVIDERS_XNNPACK}
+    ${PROVIDERS_WEBGPU}
     ${PROVIDERS_AZURE}
     ${PROVIDERS_QNN}
     onnxruntime_optimizer
diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake
index 7fbedb6059621..e148215200e4f 100644
--- a/cmake/onnxruntime_unittests.cmake
+++ b/cmake/onnxruntime_unittests.cmake
@@ -563,6 +563,10 @@ if(onnxruntime_USE_JSEP)
   list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_js)
 endif()
 
+if(onnxruntime_USE_WEBGPU)
+  list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_webgpu)
+endif()
+
 if(onnxruntime_USE_RKNPU)
   list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_rknpu)
 endif()
@@ -608,6 +612,7 @@ set(ONNXRUNTIME_TEST_LIBS
     ${PROVIDERS_NNAPI}
     ${PROVIDERS_VSINPU}
     ${PROVIDERS_JS}
+    ${PROVIDERS_WEBGPU}
     ${PROVIDERS_QNN}
     ${PROVIDERS_SNPE}
     ${PROVIDERS_RKNPU}
@@ -670,6 +675,13 @@ if(onnxruntime_USE_JSEP)
   list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_js)
 endif()
 
+if(onnxruntime_USE_WEBGPU)
+  list(APPEND onnxruntime_test_framework_src_patterns  ${TEST_SRC_DIR}/providers/webgpu/*)
+  list(APPEND onnxruntime_test_framework_libs onnxruntime_providers_webgpu)
+  list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_webgpu)
+  list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_webgpu)
+endif()
+
 # QNN EP tests require CPU EP op implementations for accuracy evaluation, so disable on minimal
 # or reduced op builds.
 if(onnxruntime_USE_QNN AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_REDUCED_OPS_BUILD)
diff --git a/cmake/patches/dawn/dawn.patch b/cmake/patches/dawn/dawn.patch
new file mode 100644
index 0000000000000..d696d386452e8
--- /dev/null
+++ b/cmake/patches/dawn/dawn.patch
@@ -0,0 +1,66 @@
+diff --git a/src/dawn/native/CMakeLists.txt b/src/dawn/native/CMakeLists.txt
+index 9c0bd6fa4e..bf8a57aeac 100644
+--- a/src/dawn/native/CMakeLists.txt
++++ b/src/dawn/native/CMakeLists.txt
+@@ -857,6 +857,11 @@ if (DAWN_ENABLE_SWIFTSHADER)
+     target_compile_definitions(dawn_native PRIVATE "DAWN_ENABLE_SWIFTSHADER")
+ endif()
+
++if (IOS)
++    target_compile_options(dawn_native_objects PRIVATE -fno-objc-arc)
++    target_compile_options(dawn_native PRIVATE -fno-objc-arc)
++endif()
++
+ if (DAWN_BUILD_MONOLITHIC_LIBRARY)
+     ###############################################################################
+     # Do the 'complete_lib' build.
+diff --git a/src/dawn/native/Surface_metal.mm b/src/dawn/native/Surface_metal.mm
+index ce55acbd43..baa4835362 100644
+--- a/src/dawn/native/Surface_metal.mm
++++ b/src/dawn/native/Surface_metal.mm
+@@ -36,7 +36,13 @@
+ namespace dawn::native {
+
+ bool InheritsFromCAMetalLayer(void* obj) {
+-    id<NSObject> object = static_cast<id>(obj);
++    id<NSObject> object =
++#if TARGET_OS_IOS
++        (__bridge id)obj;
++#else
++        static_cast<id>(obj);
++#endif
++
+     return [object isKindOfClass:[CAMetalLayer class]];
+ }
+
+diff --git a/src/dawn/native/metal/SharedFenceMTL.mm b/src/dawn/native/metal/SharedFenceMTL.mm
+index bde8bfea07..f2f6459e91 100644
+--- a/src/dawn/native/metal/SharedFenceMTL.mm
++++ b/src/dawn/native/metal/SharedFenceMTL.mm
+@@ -40,7 +40,13 @@ ResultOrError<Ref<SharedFence>> SharedFence::Create(
+     DAWN_INVALID_IF(descriptor->sharedEvent == nullptr, "MTLSharedEvent is missing.");
+     if (@available(macOS 10.14, iOS 12.0, *)) {
+         return AcquireRef(new SharedFence(
+-            device, label, static_cast<id<MTLSharedEvent>>(descriptor->sharedEvent)));
++            device, label,
++#if TARGET_OS_IOS
++            (__bridge id<MTLSharedEvent>)(descriptor->sharedEvent)
++#else
++            static_cast<id<MTLSharedEvent>>(descriptor->sharedEvent)
++#endif
++            ));
+     } else {
+         return DAWN_INTERNAL_ERROR("MTLSharedEvent not supported.");
+     }
+diff --git a/src/tint/api/BUILD.cmake b/src/tint/api/BUILD.cmake
+index 0037d83276..6372c4ee77 100644
+--- a/src/tint/api/BUILD.cmake
++++ b/src/tint/api/BUILD.cmake
+@@ -57,6 +57,7 @@ tint_target_add_dependencies(tint_api lib
+   tint_lang_wgsl_ast_transform
+   tint_lang_wgsl_common
+   tint_lang_wgsl_features
++  tint_lang_wgsl_inspector
+   tint_lang_wgsl_program
+   tint_lang_wgsl_sem
+   tint_lang_wgsl_writer_ir_to_program
diff --git a/include/onnxruntime/core/graph/constants.h b/include/onnxruntime/core/graph/constants.h
index f76fae360a9f9..8b3e5e48e7004 100644
--- a/include/onnxruntime/core/graph/constants.h
+++ b/include/onnxruntime/core/graph/constants.h
@@ -51,6 +51,7 @@ constexpr const char* kSnpeExecutionProvider = "SNPEExecutionProvider";
 constexpr const char* kTvmExecutionProvider = "TvmExecutionProvider";
 constexpr const char* kXnnpackExecutionProvider = "XnnpackExecutionProvider";
 constexpr const char* kWebNNExecutionProvider = "WebNNExecutionProvider";
+constexpr const char* kWebGpuExecutionProvider = "WebGpuExecutionProvider";
 constexpr const char* kCannExecutionProvider = "CANNExecutionProvider";
 constexpr const char* kAzureExecutionProvider = "AzureExecutionProvider";
 constexpr const char* kVSINPUExecutionProvider = "VSINPUExecutionProvider";
diff --git a/include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h b/include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h
new file mode 100644
index 0000000000000..0b45b847d651f
--- /dev/null
+++ b/include/onnxruntime/core/providers/webgpu/webgpu_provider_factory.h
@@ -0,0 +1,14 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+// Dummy file to provide a signal in the ONNX Runtime C cocoapod as to whether the WebGPU EP was included in the build.
+// If it was, this file will be included in the cocoapod, and a test like this can be used:
+//
+//   #if __has_include(<onnxruntime/webgpu_provider_factory.h>)
+//     #define WEBGPU_EP_AVAILABLE 1
+//   #else
+//     #define WEBGPU_EP_AVAILABLE 0
+//   #endif
+
+// The WebGPU EP can be enabled via the generic SessionOptionsAppendExecutionProvider method, so no direct usage of
+// the provider factory is required.
diff --git a/java/src/main/java/ai/onnxruntime/OrtProvider.java b/java/src/main/java/ai/onnxruntime/OrtProvider.java
index 0e2883fe23088..1740ac7eeef00 100644
--- a/java/src/main/java/ai/onnxruntime/OrtProvider.java
+++ b/java/src/main/java/ai/onnxruntime/OrtProvider.java
@@ -42,7 +42,9 @@ public enum OrtProvider {
   /** The Azure remote endpoint execution provider. */
   AZURE("AzureExecutionProvider"),
   /** The QNN execution provider. */
-  QNN("QNNExecutionProvider");
+  QNN("QNNExecutionProvider"),
+  /** The WebGPU execution provider. */
+  WEBGPU("WebGpuExecutionProvider");
 
   private static final Map<String, OrtProvider> valueMap = new HashMap<>(values().length);
 
diff --git a/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc b/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc
new file mode 100644
index 0000000000000..8ed1372cd0e62
--- /dev/null
+++ b/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.cc
@@ -0,0 +1,34 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "contrib_ops/webgpu/webgpu_contrib_kernels.h"
+
+#include "core/framework/op_kernel.h"
+
+namespace onnxruntime {
+namespace contrib {
+namespace webgpu {
+
+template <>
+KernelCreateInfo BuildKernelCreateInfo<void>() {
+  // Placeholder specialization: yields a default-constructed KernelCreateInfo.
+  return KernelCreateInfo{};
+}
+
+Status RegisterWebGpuContribKernels(KernelRegistry& kernel_registry) {
+  static const BuildKernelCreateInfoFn kernel_builders[] = {
+      BuildKernelCreateInfo<void>,  // placeholder entry so the table is never empty after ops reduction
+  };
+
+  // Register every entry that carries a kernel definition; entries without one are skipped.
+  for (const auto& build_kernel : kernel_builders) {
+    KernelCreateInfo create_info = build_kernel();
+    if (create_info.kernel_def == nullptr) continue;
+    ORT_RETURN_IF_ERROR(kernel_registry.Register(std::move(create_info)));
+  }
+  return Status::OK();
+}
+
+}  // namespace webgpu
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.h b/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.h
new file mode 100644
index 0000000000000..d73859de78239
--- /dev/null
+++ b/onnxruntime/contrib_ops/webgpu/webgpu_contrib_kernels.h
@@ -0,0 +1,20 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/framework/kernel_registry.h"
+
+namespace onnxruntime {
+namespace contrib {
+namespace webgpu {
+
+// forward declaration for this EP's namespace.
+template <typename T>
+KernelCreateInfo BuildKernelCreateInfo();
+
+Status RegisterWebGpuContribKernels(KernelRegistry& kernel_registry);
+
+}  // namespace webgpu
+}  // namespace contrib
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/platform/apple/logging/apple_log_sink.mm b/onnxruntime/core/platform/apple/logging/apple_log_sink.mm
index 00e691a8f9fd3..6abbe76a7f151 100644
--- a/onnxruntime/core/platform/apple/logging/apple_log_sink.mm
+++ b/onnxruntime/core/platform/apple/logging/apple_log_sink.mm
@@ -7,8 +7,6 @@
 
 #include <sstream>
 
-#include "date/date.h"
-
 namespace onnxruntime {
 namespace logging {
 
diff --git a/onnxruntime/core/providers/get_execution_providers.cc b/onnxruntime/core/providers/get_execution_providers.cc
index 61c035bc29ed5..d2a72c3a38b03 100644
--- a/onnxruntime/core/providers/get_execution_providers.cc
+++ b/onnxruntime/core/providers/get_execution_providers.cc
@@ -162,6 +162,14 @@ constexpr ProviderInfo kProvidersInPriorityOrder[] =
             true,
 #else
             false,
+#endif
+        },
+        {
+            kWebGpuExecutionProvider,
+#ifdef USE_WEBGPU
+            true,
+#else
+            false,
 #endif
         },
         {
diff --git a/onnxruntime/core/providers/provider_factory_creators.h b/onnxruntime/core/providers/provider_factory_creators.h
index da5de83a29c11..25d02a48c13c5 100644
--- a/onnxruntime/core/providers/provider_factory_creators.h
+++ b/onnxruntime/core/providers/provider_factory_creators.h
@@ -95,6 +95,10 @@
 #include "core/providers/webnn/webnn_provider_factory_creator.h"
 #endif
 
+#if defined(USE_WEBGPU)
+#include "core/providers/webgpu/webgpu_provider_factory_creator.h"
+#endif
+
 #if defined(USE_CANN)
 #include "core/providers/cann/cann_provider_factory_creator.h"
 #endif
diff --git a/onnxruntime/core/providers/webgpu/symbols.txt b/onnxruntime/core/providers/webgpu/symbols.txt
new file mode 100644
index 0000000000000..e69de29bb2d1d
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
new file mode 100644
index 0000000000000..00ebdd5583d2e
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.cc
@@ -0,0 +1,108 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "core/providers/webgpu/webgpu_execution_provider.h"
+
+#include <string_view>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+
+#ifndef DISABLE_CONTRIB_OPS
+#include "contrib_ops/webgpu/webgpu_contrib_kernels.h"
+#endif
+
+#include "core/framework/compute_capability.h"
+#include "core/framework/data_transfer_manager.h"
+#include "core/framework/fallback_cpu_capability.h"
+#include "core/framework/kernel_registry.h"
+#include "core/graph/function_utils.h"
+#include "core/graph/indexed_sub_graph.h"
+
+namespace onnxruntime {
+
+namespace webgpu {
+template <>
+KernelCreateInfo BuildKernelCreateInfo<void>() {
+  // Placeholder entry: an empty KernelCreateInfo, which RegisterKernels() skips via its null kernel_def.
+  return KernelCreateInfo{};
+}
+
+class Memcpy final : public OpKernel {  // kernel body shared by MemcpyFromHost and MemcpyToHost
+ public:
+  explicit Memcpy(const OpKernelInfo& info) : OpKernel(info) {}
+  Status Compute(OpKernelContext* ctx) const override {  // copies input 0 into a same-shaped output 0
+    const auto* X = ctx->Input<Tensor>(0);
+    ORT_ENFORCE(X != nullptr, "Memcpy: input tensor is missing.");  // X is dereferenced just below
+    Tensor* Y = ctx->Output(0, X->Shape());
+    return Info().GetDataTransferManager().CopyTensor(*X, *Y);
+  }
+};
+
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, MemcpyFromHost);
+class ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, MemcpyToHost);
+
+ONNX_OPERATOR_KERNEL_EX(
+    MemcpyFromHost,
+    kOnnxDomain,
+    1,
+    kWebGpuExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .InputMemoryType(OrtMemTypeCPU, 0)  // input 0 is declared as CPU memory
+        .ExecQueueId(0)
+        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
+    Memcpy);  // implemented by the Memcpy kernel class defined in this file
+
+ONNX_OPERATOR_KERNEL_EX(
+    MemcpyToHost,
+    kOnnxDomain,
+    1,
+    kWebGpuExecutionProvider,
+    (*KernelDefBuilder::Create())
+        .OutputMemoryType(OrtMemTypeCPU, 0)  // output 0 is declared as CPU memory
+        .ExecQueueId(1)
+        .TypeConstraint("T", DataTypeImpl::AllFixedSizeTensorTypes()),
+    Memcpy);  // same kernel class as MemcpyFromHost; only the kernel definition differs
+
+// Builds the WebGPU kernel registry: the two Memcpy kernels plus, unless disabled, the contrib kernels.
+std::unique_ptr<KernelRegistry> RegisterKernels() {
+  static const BuildKernelCreateInfoFn kernel_builders[] = {
+      BuildKernelCreateInfo<void>,  // default entry to avoid the list becoming empty after ops-reducing
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, MemcpyFromHost)>,
+      BuildKernelCreateInfo<ONNX_OPERATOR_KERNEL_CLASS_NAME(kWebGpuExecutionProvider, kOnnxDomain, 1, MemcpyToHost)>,
+  };
+
+  auto registry = std::make_unique<onnxruntime::KernelRegistry>();
+  for (const auto& build_kernel : kernel_builders) {
+    KernelCreateInfo create_info = build_kernel();
+    if (create_info.kernel_def != nullptr) {  // the <void> placeholder has no kernel definition; skip it
+      ORT_THROW_IF_ERROR(registry->Register(std::move(create_info)));
+    }
+  }
+
+#ifndef DISABLE_CONTRIB_OPS
+  Status status = ::onnxruntime::contrib::webgpu::RegisterWebGpuContribKernels(*registry);
+  ORT_ENFORCE(status.IsOK(), "Failed to register WebGPU contrib kernels: " + status.ErrorMessage());
+#endif
+
+  return registry;
+}
+
+}  // namespace webgpu
+
+using namespace webgpu;
+
+WebGpuExecutionProvider::WebGpuExecutionProvider()  // base is given a GPU OrtDevice with the DEFAULT memory type
+    : IExecutionProvider{kWebGpuExecutionProvider, OrtDevice(OrtDevice::GPU, OrtDevice::MemType::DEFAULT, 0)} {}
+
+// Returns the kernel registry shared by every call; it is built once, on first use.
+std::shared_ptr<KernelRegistry> WebGpuExecutionProvider::GetKernelRegistry() const {
+  static const std::shared_ptr<KernelRegistry> shared_registry{webgpu::RegisterKernels()};
+  return shared_registry;
+}
+
+// No explicit teardown is performed here; defined out of line to match the declaration in the header.
+WebGpuExecutionProvider::~WebGpuExecutionProvider() = default;
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
new file mode 100644
index 0000000000000..537ecb9301f67
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/webgpu_execution_provider.h
@@ -0,0 +1,37 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Copyright (c) 2019, NXP Semiconductor, Inc. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "core/framework/execution_provider.h"
+#include "core/framework/session_options.h"
+#include "core/graph/constants.h"
+#include "core/providers/providers.h"
+
+namespace onnxruntime {
+namespace webgpu {
+
+// forward declaration for this EP's namespace.
+template <typename T>
+KernelCreateInfo BuildKernelCreateInfo();
+
+}  // namespace webgpu
+
+class WebGpuExecutionProvider : public IExecutionProvider {
+ public:
+  WebGpuExecutionProvider();
+  ~WebGpuExecutionProvider() override;
+
+  std::shared_ptr<KernelRegistry> GetKernelRegistry() const override;
+
+  DataLayout GetPreferredLayout() const override { return DataLayout::NHWC; }
+
+  FusionStyle GetFusionStyle() const override { return FusionStyle::FilteredGraphViewer; }
+
+  // The WebGPU EP disallows concurrent runs because the actual implementation (e.g. the WebGPU backend) relies on
+  // global state, and concurrent runs with async functions may corrupt that state and cause undefined behavior.
+  bool ConcurrentRunSupported() const override { return false; }
+};
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
new file mode 100644
index 0000000000000..1a1f1a438c750
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory.cc
@@ -0,0 +1,24 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <charconv>
+
+#include "core/framework/error_code_helper.h"
+#include "core/providers/webgpu/webgpu_provider_factory_creator.h"
+#include "core/providers/webgpu/webgpu_execution_provider.h"
+
+namespace onnxruntime {
+
+// Factory that hands out WebGpuExecutionProvider instances; it carries no state of its own.
+struct WebGpuProviderFactory : IExecutionProviderFactory {
+  std::unique_ptr<IExecutionProvider> CreateProvider() override {
+    return std::make_unique<WebGpuExecutionProvider>();
+  }
+};
+
+// The ConfigOptions argument is currently unused (the parameter is intentionally unnamed).
+std::shared_ptr<IExecutionProviderFactory> WebGpuProviderFactoryCreator::Create(const ConfigOptions&) {
+  return std::make_shared<WebGpuProviderFactory>();
+}
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/providers/webgpu/webgpu_provider_factory_creator.h b/onnxruntime/core/providers/webgpu/webgpu_provider_factory_creator.h
new file mode 100644
index 0000000000000..6257a85d45760
--- /dev/null
+++ b/onnxruntime/core/providers/webgpu/webgpu_provider_factory_creator.h
@@ -0,0 +1,18 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include <memory>
+
+#include "core/framework/provider_options.h"
+#include "core/providers/providers.h"
+
+namespace onnxruntime {
+struct ConfigOptions;
+
+struct WebGpuProviderFactoryCreator {  // static entry point that returns the WebGPU EP factory
+  static std::shared_ptr<IExecutionProviderFactory> Create(const ConfigOptions& config_options);
+};
+
+}  // namespace onnxruntime
diff --git a/onnxruntime/core/session/inference_session.cc b/onnxruntime/core/session/inference_session.cc
index 83e7596d2f6b8..76d34aabab6cb 100644
--- a/onnxruntime/core/session/inference_session.cc
+++ b/onnxruntime/core/session/inference_session.cc
@@ -759,12 +759,12 @@ common::Status InferenceSession::RegisterExecutionProvider(const std::shared_ptr
 
   // Some session option values (default or user provided) may not work with some EPs.
   // Rather than put the onus on the user to know these, make the appropriate change while logging the change.
-  if (provider_type == onnxruntime::kDmlExecutionProvider) {
-    // DML's memory is not byte addressable and hence mem pattern doesn't work.
+  if (provider_type == onnxruntime::kDmlExecutionProvider || provider_type == onnxruntime::kWebGpuExecutionProvider) {
+    // DML and WebGPU memory is not byte addressable and hence mem pattern doesn't work.
     if (session_options_.enable_mem_pattern) {
       LOGS(*session_logger_, INFO)
-          << "Having memory pattern enabled is not supported while using the DML Execution Provider. "
-          << "So disabling it for this session since it uses the DML Execution Provider.";
+          << "Having memory pattern enabled is not supported while using " << provider_type << ". "
+          << "So disabling it for this session since it uses " << provider_type << ".";
       session_options_.enable_mem_pattern = false;
     }
 
diff --git a/onnxruntime/core/session/provider_registration.cc b/onnxruntime/core/session/provider_registration.cc
index 68aeea9f3c4d2..8c512c561ea8c 100644
--- a/onnxruntime/core/session/provider_registration.cc
+++ b/onnxruntime/core/session/provider_registration.cc
@@ -131,6 +131,12 @@ ORT_API_STATUS_IMPL(OrtApis::SessionOptionsAppendExecutionProvider,
     options->provider_factories.push_back(WebNNProviderFactoryCreator::Create(provider_options));
 #else
     status = create_not_supported_status();
+#endif
+  } else if (strcmp(provider_name, "WebGPU") == 0) {
+#if defined(USE_WEBGPU)
+    options->provider_factories.push_back(WebGpuProviderFactoryCreator::Create(options->value.config_options));
+#else
+    status = create_not_supported_status();
 #endif
   } else if (strcmp(provider_name, "AZURE") == 0) {
 #if defined(USE_AZURE)
diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc
index cce88db1e61e3..3062738eefcf2 100644
--- a/onnxruntime/python/onnxruntime_pybind_state.cc
+++ b/onnxruntime/python/onnxruntime_pybind_state.cc
@@ -1239,6 +1239,10 @@ std::unique_ptr<IExecutionProvider> CreateExecutionProviderInstance(
     return onnxruntime::XnnpackProviderFactoryCreator::Create(
                cit == provider_options_map.end() ? ProviderOptions{} : cit->second, &session_options)
         ->CreateProvider();
+#endif
+  } else if (type == kWebGpuExecutionProvider) {
+#if defined(USE_WEBGPU)
+    return onnxruntime::WebGpuProviderFactoryCreator::Create(session_options.config_options)->CreateProvider();
 #endif
   } else if (type == kCannExecutionProvider) {
 #ifdef USE_CANN
diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc
index e8c948ade1068..6d86e4c35af85 100644
--- a/onnxruntime/test/onnx/main.cc
+++ b/onnxruntime/test/onnx/main.cc
@@ -48,13 +48,16 @@ void usage() {
       "\t-v: verbose\n"
       "\t-n [test_case_name]: Specifies a single test case to run.\n"
       "\t-e [EXECUTION_PROVIDER]: EXECUTION_PROVIDER could be 'cpu', 'cuda', 'dnnl', 'tensorrt', 'vsinpu'"
-      "'openvino', 'rocm', 'migraphx', 'acl', 'armnn', 'xnnpack', 'nnapi', 'qnn', 'snpe' or 'coreml'. "
+      "'openvino', 'rocm', 'migraphx', 'acl', 'armnn', 'xnnpack', 'webgpu', 'nnapi', 'qnn', 'snpe' or 'coreml'. "
       "Default: 'cpu'.\n"
       "\t-p: Pause after launch, can attach debugger and continue\n"
       "\t-x: Use parallel executor, default (without -x): sequential executor.\n"
       "\t-d [device_id]: Specifies the device id for multi-device (e.g. GPU). The value should > 0\n"
       "\t-t: Specify custom relative tolerance values for output value comparison. default: 1e-5\n"
       "\t-a: Specify custom absolute tolerance values for output value comparison. default: 1e-5\n"
+      "\t-C: Specify session configuration entries as key-value pairs: -C \"<key1>|<value1> <key2>|<value2>\" \n"
+      "\t    Refer to onnxruntime_session_options_config_keys.h for valid keys and values. \n"
+      "\t    [Example] -C \"session.disable_cpu_ep_fallback|1 ep.context_enable|1\" \n"
       "\t-i: Specify EP specific runtime options as key value pairs. Different runtime options available are: \n"
       "\t    [QNN only] [backend_path]: QNN backend path. e.g '/folderpath/libQnnHtp.so', '/folderpath/libQnnCpu.so'.\n"
       "\t    [QNN only] [profiling_level]: QNN profiling level, options:  'basic', 'detailed', default 'off'.\n"
@@ -124,6 +127,39 @@ static TestTolerances LoadTestTolerances(bool enable_cuda, bool enable_openvino,
       overrides_json["atol_default"], overrides_json["rtol_default"], absolute_overrides, relative_overrides);
 }
 
+// Parses whitespace-separated "<key>|<value>" tokens from configs_string into session_configs.
+// Returns true when every token was stored. Returns false, after printing the reason to stderr,
+// when a token lacks a non-empty key or a non-empty value, or when its key is already present;
+// entries stored before the failing token are left in session_configs.
+static bool ParseSessionConfigs(const std::string& configs_string,
+                                std::unordered_map<std::string, std::string>& session_configs) {
+  std::istringstream ss(configs_string);
+  std::string token;
+
+  // operator>> skips whitespace, so every extracted token is non-empty.
+  while (ss >> token) {
+    const std::string_view token_sv(token);
+
+    // '|' needs at least one character on each side; find() never returns length() itself.
+    const auto pos = token_sv.find('|');
+    if (pos == std::string_view::npos || pos == 0 || pos + 1 == token_sv.length()) {
+      fprintf(stderr, "Invalid session config entry '%s': expected <key>|<value>.\n", token.c_str());
+      return false;
+    }
+
+    std::string key(token_sv.substr(0, pos));
+    std::string value(token_sv.substr(pos + 1));
+
+    // try_emplace only inserts when the key is new, so it doubles as the duplicate check.
+    if (!session_configs.try_emplace(std::move(key), std::move(value)).second) {
+      fprintf(stderr, "Duplicate session config key in entry '%s'.\n", token.c_str());
+      return false;
+    }
+  }
+
+  return true;
+}
+
 #ifdef _WIN32
 int GetNumCpuCores() {
   SYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer[256];
@@ -180,6 +216,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
   bool enable_armnn = false;
   bool enable_rocm = false;
   bool enable_migraphx = false;
+  bool enable_webgpu = false;
   bool enable_xnnpack = false;
   bool override_tolerance = false;
   double atol = 1e-5;
@@ -189,6 +226,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
   bool user_graph_optimization_level_set = false;
   bool set_denormal_as_zero = false;
   std::basic_string<ORTCHAR_T> ep_runtime_config_string;
+  std::unordered_map<std::string, std::string> session_config_entries;
   std::string provider_name = "cpu";
 
   OrtLoggingLevel logging_level = ORT_LOGGING_LEVEL_ERROR;
@@ -199,7 +237,7 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
   bool pause = false;
   {
     int ch;
-    while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:t:a:xvo:d:i:pzfb"))) != -1) {
+    while ((ch = getopt(argc, argv, ORT_TSTR("Ac:hj:Mn:r:e:t:a:xvo:d:C:i:pzfb"))) != -1) {
       switch (ch) {
         case 'A':
           enable_cpu_mem_arena = false;
@@ -268,6 +306,8 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
             enable_rocm = true;
           } else if (!CompareCString(optarg, ORT_TSTR("migraphx"))) {
             enable_migraphx = true;
+          } else if (!CompareCString(optarg, ORT_TSTR("webgpu"))) {
+            enable_webgpu = true;
           } else if (!CompareCString(optarg, ORT_TSTR("xnnpack"))) {
             enable_xnnpack = true;
           } else {
@@ -324,6 +364,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
             return -1;
           }
           break;
+        case 'C':
+          if (!ParseSessionConfigs(ToUTF8String(optarg), session_config_entries)) {
+            return -1;
+          }
+          break;
         case 'i':
           ep_runtime_config_string = optarg;
           break;
@@ -410,6 +455,10 @@ int real_main(int argc, char* argv[], Ort::Env& env) {
     if (disable_ep_context_embed_mode)
       sf.AddConfigEntry(kOrtSessionOptionEpContextEmbedMode, "0");
 
+    for (auto& it : session_config_entries) {
+      sf.AddConfigEntry(it.first.c_str(), it.second.c_str());
+    }
+
     if (enable_tensorrt) {
 #ifdef USE_TENSORRT
       Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, device_id));
@@ -699,6 +748,15 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)");
 #endif
     }
 
+    if (enable_webgpu) {
+#ifdef USE_WEBGPU
+      sf.AppendExecutionProvider("WebGPU", {});
+#else
+      fprintf(stderr, "WebGPU is not supported in this build");
+      return -1;
+#endif
+    }
+
     if (user_graph_optimization_level_set) {
       sf.SetGraphOptimizationLevel(graph_optimization_level);
     }
diff --git a/onnxruntime/test/perftest/command_args_parser.cc b/onnxruntime/test/perftest/command_args_parser.cc
index 6e811f4596eab..42b73ec384cf5 100644
--- a/onnxruntime/test/perftest/command_args_parser.cc
+++ b/onnxruntime/test/perftest/command_args_parser.cc
@@ -38,8 +38,8 @@ namespace perftest {
       "\t-A: Disable memory arena\n"
       "\t-I: Generate tensor input binding (Free dimensions are treated as 1.)\n"
       "\t-c [parallel runs]: Specifies the (max) number of runs to invoke simultaneously. Default:1.\n"
-      "\t-e [cpu|cuda|dnnl|tensorrt|openvino|dml|acl|nnapi|coreml|qnn|snpe|rocm|migraphx|xnnpack|vitisai]: Specifies the provider 'cpu','cuda','dnnl','tensorrt', "
-      "'openvino', 'dml', 'acl', 'nnapi', 'coreml', 'qnn', 'snpe', 'rocm', 'migraphx', 'xnnpack' or 'vitisai'. "
+      "\t-e [cpu|cuda|dnnl|tensorrt|openvino|dml|acl|nnapi|coreml|qnn|snpe|rocm|migraphx|xnnpack|vitisai|webgpu]: Specifies the provider 'cpu','cuda','dnnl','tensorrt', "
+      "'openvino', 'dml', 'acl', 'nnapi', 'coreml', 'qnn', 'snpe', 'rocm', 'migraphx', 'xnnpack', 'vitisai' or 'webgpu'. "
       "Default:'cpu'.\n"
       "\t-b [tf|ort]: backend to use. Default:ort\n"
       "\t-r [repeated_times]: Specifies the repeated times if running in 'times' test mode.Default:1000.\n"
@@ -282,6 +282,8 @@ static bool ParseSessionConfigs(const std::string& configs_string,
           test_config.machine_config.provider_type_name = onnxruntime::kXnnpackExecutionProvider;
         } else if (!CompareCString(optarg, ORT_TSTR("vitisai"))) {
           test_config.machine_config.provider_type_name = onnxruntime::kVitisAIExecutionProvider;
+        } else if (!CompareCString(optarg, ORT_TSTR("webgpu"))) {
+          test_config.machine_config.provider_type_name = onnxruntime::kWebGpuExecutionProvider;
         } else {
           return false;
         }
diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc
index 3ed5eaee5a5f7..eb230ac771e13 100644
--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@@ -593,6 +593,13 @@ select from 'TF8', 'TF16', 'UINT8', 'FLOAT', 'ITENSOR'. \n)");
         "XNNPACK", {{"intra_op_num_threads", std::to_string(performance_test_config.run_config.intra_op_num_threads)}});
 #else
     ORT_THROW("Xnnpack is not supported in this build\n");
+#endif
+  } else if (provider_name_ == onnxruntime::kWebGpuExecutionProvider) {
+#ifdef USE_WEBGPU
+    session_options.AppendExecutionProvider(
+        "WebGPU", {{"intra_op_num_threads", std::to_string(performance_test_config.run_config.intra_op_num_threads)}});
+#else
+    ORT_THROW("WebGPU is not supported in this build\n");
 #endif
   } else if (provider_name_ == onnxruntime::kVitisAIExecutionProvider) {
 #ifdef USE_VITISAI
diff --git a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
index d145a00b1348f..32b4b32e299d6 100644
--- a/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
+++ b/onnxruntime/test/platform/apple/apple_package_test/ios_package_testUITests/ios_package_uitest_cpp_api.mm
@@ -13,15 +13,19 @@
 
 #if __has_include(<onnxruntime/coreml_provider_factory.h>)
 #define COREML_EP_AVAILABLE 1
+#include <onnxruntime/coreml_provider_factory.h>
 #else
 #define COREML_EP_AVAILABLE 0
 #endif
 
-#if COREML_EP_AVAILABLE
-#include <onnxruntime/coreml_provider_factory.h>
+#if __has_include(<onnxruntime/webgpu_provider_factory.h>)
+#define WEBGPU_EP_AVAILABLE 1
+// WebGPU EP doesn't require including the header as it's enabled via AppendExecutionProvider
+#else
+#define WEBGPU_EP_AVAILABLE 0
 #endif
 
-void testSigmoid(const char* modelPath, bool useCoreML) {
+void testSigmoid(const char* modelPath, bool useCoreML = false, bool useWebGPU = false) {
   // This is an e2e test for ORT C++ API
   Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "testCppAPI");
 
@@ -38,6 +42,12 @@ void testSigmoid(const char* modelPath, bool useCoreML) {
   (void)useCoreML;
 #endif
 
+  if (useWebGPU) {
+    std::unordered_map<std::string, std::string> provider_options;
+    // set provider options if needed. e.g. deviceId
+    session_options.AppendExecutionProvider("WebGPU", provider_options);
+  }
+
   Ort::Session session(env, modelPath, session_options);
 
   size_t input_tensor_size = 3 * 4 * 5;
@@ -96,7 +106,7 @@ - (NSString*)getFilePath {
 }
 
 - (void)testCppAPI_Basic {
-  testSigmoid([self getFilePath].UTF8String, false /* useCoreML */);
+  testSigmoid([self getFilePath].UTF8String);
 }
 
 #if COREML_EP_AVAILABLE
@@ -105,4 +115,9 @@ - (void)testCppAPI_Basic_CoreML {
 }
 #endif
 
+#if WEBGPU_EP_AVAILABLE
+- (void)testCppAPI_Basic_WebGPU {  // runs testSigmoid with useCoreML off and useWebGPU on
+  testSigmoid([self getFilePath].UTF8String, false /* useCoreML */, true /* useWebGPU */);
+}
+#endif
 @end
diff --git a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
index 613c6e545939f..86001b6cb50a5 100644
--- a/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
+++ b/onnxruntime/test/platform/apple/apple_package_test/macos_package_testUITests/macos_package_uitest_cpp_api.mm
@@ -13,15 +13,19 @@
 
 #if __has_include(<onnxruntime/coreml_provider_factory.h>)
 #define COREML_EP_AVAILABLE 1
+#include <onnxruntime/coreml_provider_factory.h>
 #else
 #define COREML_EP_AVAILABLE 0
 #endif
 
-#if COREML_EP_AVAILABLE
-#include <onnxruntime/coreml_provider_factory.h>
+#if __has_include(<onnxruntime/webgpu_provider_factory.h>)
+#define WEBGPU_EP_AVAILABLE 1
+// WebGPU EP doesn't require including the header as it's enabled via AppendExecutionProvider
+#else
+#define WEBGPU_EP_AVAILABLE 0
 #endif
 
-void testSigmoid(const char* modelPath, bool useCoreML) {
+void testSigmoid(const char* modelPath, bool useCoreML = false, bool useWebGPU = false) {
   // This is an e2e test for ORT C++ API
   Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "testCppAPI");
 
@@ -38,6 +42,12 @@ void testSigmoid(const char* modelPath, bool useCoreML) {
   (void)useCoreML;
 #endif
 
+  if (useWebGPU) {
+    std::unordered_map<std::string, std::string> provider_options;
+    // set provider options if needed. e.g. deviceId
+    session_options.AppendExecutionProvider("WebGPU", provider_options);
+  }
+
   Ort::Session session(env, modelPath, session_options);
 
   size_t input_tensor_size = 3 * 4 * 5;
@@ -96,7 +106,7 @@ - (NSString*)getFilePath {
 }
 
 - (void)testCppAPI_Basic {
-  testSigmoid([self getFilePath].UTF8String, false /* useCoreML */);
+  testSigmoid([self getFilePath].UTF8String);
 }
 
 #if COREML_EP_AVAILABLE
@@ -105,4 +115,10 @@ - (void)testCppAPI_Basic_CoreML {
 }
 #endif
 
+#if WEBGPU_EP_AVAILABLE
+- (void)testCppAPI_Basic_WebGPU {  // runs testSigmoid with useCoreML off and useWebGPU on
+  testSigmoid([self getFilePath].UTF8String, false /* useCoreML */, true /* useWebGPU */);
+}
+#endif
+
 @end
diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc
index 9625b67be79c0..9a17e1b610017 100644
--- a/onnxruntime/test/providers/base_tester.cc
+++ b/onnxruntime/test/providers/base_tester.cc
@@ -657,6 +657,7 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter,
           kQnnExecutionProvider,
           kSnpeExecutionProvider,
           kXnnpackExecutionProvider,
+          kWebGpuExecutionProvider,
       };
 
       // need to special case any synthetic EP names in the exclude list
@@ -712,6 +713,8 @@ void BaseTester::RunWithConfig(size_t* number_of_pre_packed_weights_counter,
           execution_provider = DefaultXnnpackExecutionProvider();
         else if (provider_type == onnxruntime::kDmlExecutionProvider)
           execution_provider = DefaultDmlExecutionProvider();
+        else if (provider_type == onnxruntime::kWebGpuExecutionProvider)
+          execution_provider = DefaultWebGpuExecutionProvider();
         else if (provider_type == onnxruntime::kNvDmlExecutionProvider)
           execution_provider = DefaultNvDmlExecutionProvider();
 
diff --git a/onnxruntime/test/providers/compare_provider_test_utils.cc b/onnxruntime/test/providers/compare_provider_test_utils.cc
index 3ef74259e27b6..386a5656d8a01 100644
--- a/onnxruntime/test/providers/compare_provider_test_utils.cc
+++ b/onnxruntime/test/providers/compare_provider_test_utils.cc
@@ -36,6 +36,8 @@ std::unique_ptr<IExecutionProvider> GetExecutionProvider(const std::string& prov
     execution_provider = DefaultRocmExecutionProvider();
   else if (provider_type == onnxruntime::kDmlExecutionProvider)
     execution_provider = DefaultDmlExecutionProvider();
+  else if (provider_type == onnxruntime::kWebGpuExecutionProvider)
+    execution_provider = DefaultWebGpuExecutionProvider();
   // skip if execution provider is disabled
   if (execution_provider == nullptr) {
     return nullptr;
diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc
index edc6bda12b5f7..e4cff3acfa08d 100644
--- a/onnxruntime/test/util/default_providers.cc
+++ b/onnxruntime/test/util/default_providers.cc
@@ -302,6 +302,15 @@ std::unique_ptr<IExecutionProvider> DefaultXnnpackExecutionProvider() {
 #endif
 }
 
+std::unique_ptr<IExecutionProvider> DefaultWebGpuExecutionProvider() {
+#ifdef USE_WEBGPU
+  ConfigOptions config_options{};  // value-initialized; no config entries are added before use
+  return WebGpuProviderFactoryCreator::Create(config_options)->CreateProvider();
+#else
+  return nullptr;  // built without USE_WEBGPU: there is no WebGPU EP to create
+#endif
+}
+
 std::unique_ptr<IExecutionProvider> DefaultCannExecutionProvider() {
 #ifdef USE_CANN
   OrtCANNProviderOptions provider_options{};
diff --git a/onnxruntime/test/util/include/default_providers.h b/onnxruntime/test/util/include/default_providers.h
index 8ba573cfe0f54..6a6a9c89bd5ae 100644
--- a/onnxruntime/test/util/include/default_providers.h
+++ b/onnxruntime/test/util/include/default_providers.h
@@ -63,6 +63,7 @@ std::unique_ptr<IExecutionProvider> DefaultQnnExecutionProvider();
 std::unique_ptr<IExecutionProvider> QnnExecutionProviderWithOptions(const ProviderOptions& options,
                                                                     const SessionOptions* session_options = nullptr);
 std::unique_ptr<IExecutionProvider> DefaultXnnpackExecutionProvider();
+std::unique_ptr<IExecutionProvider> DefaultWebGpuExecutionProvider();
 std::unique_ptr<IExecutionProvider> DefaultCannExecutionProvider();
 std::unique_ptr<IExecutionProvider> DefaultDmlExecutionProvider();
 std::unique_ptr<IExecutionProvider> DefaultNvDmlExecutionProvider();
diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py
index 7cc0655532c4d..b734902a6ec05 100644
--- a/tools/ci_build/build.py
+++ b/tools/ci_build/build.py
@@ -571,6 +571,7 @@ def convert_arg_line_to_args(self, arg_line):
         "--nnapi_min_api", type=int, help="Minimum Android API level to enable NNAPI, should be no less than 27"
     )
     parser.add_argument("--use_jsep", action="store_true", help="Build with JavaScript kernels.")
+    parser.add_argument("--use_webgpu", action="store_true", help="Build with WebGPU support.")
     parser.add_argument("--use_qnn", action="store_true", help="Build with QNN support.")
     parser.add_argument("--qnn_home", help="Path to QNN SDK dir.")
     parser.add_argument("--use_rknpu", action="store_true", help="Build with RKNPU.")
@@ -1058,6 +1059,7 @@ def generate_build_tree(
         "-Donnxruntime_ARMNN_RELU_USE_CPU=" + ("OFF" if args.armnn_relu else "ON"),
         "-Donnxruntime_ARMNN_BN_USE_CPU=" + ("OFF" if args.armnn_bn else "ON"),
         "-Donnxruntime_USE_JSEP=" + ("ON" if args.use_jsep else "OFF"),
+        "-Donnxruntime_USE_WEBGPU=" + ("ON" if args.use_webgpu else "OFF"),
         # Training related flags
         "-Donnxruntime_ENABLE_NVTX_PROFILE=" + ("ON" if args.enable_nvtx_profile else "OFF"),
         "-Donnxruntime_ENABLE_TRAINING=" + ("ON" if args.enable_training else "OFF"),
@@ -1317,6 +1319,9 @@ def generate_build_tree(
             raise BuildError("WebNN is only available for WebAssembly build.")
         cmake_args += ["-Donnxruntime_USE_WEBNN=ON"]
 
+    if args.use_jsep and args.use_webgpu:
+        raise BuildError("JSEP (--use_jsep) and WebGPU (--use_webgpu) cannot be enabled at the same time.")
+
     if args.use_snpe:
         cmake_args += ["-Donnxruntime_USE_SNPE=ON"]
 
diff --git a/tools/ci_build/gen_def.py b/tools/ci_build/gen_def.py
index c4add6f0e8910..2b7790ec4e683 100755
--- a/tools/ci_build/gen_def.py
+++ b/tools/ci_build/gen_def.py
@@ -80,6 +80,7 @@ def parse_arguments():
             "dnnl",
             "tensorrt",
             "azure",
+            "webgpu",
         ):
             file.write(f"#include <core/providers/{c}/{c}_provider_factory.h>\n")
     file.write("void* GetFunctionEntryByName(const char* name){\n")
diff --git a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py
index 71aeb9e7b0304..dd037c17ae3b3 100755
--- a/tools/ci_build/github/apple/build_and_assemble_apple_pods.py
+++ b/tools/ci_build/github/apple/build_and_assemble_apple_pods.py
@@ -133,6 +133,8 @@ def main():
             str(build_dir / "framework_out"),
             "--variant",
             package_variant.name,
+            "--test_project_stage_dir",  # use a specific directory so it's easier to debug
+            str(build_dir / "test_apple_packages_staging"),
         ]
 
         run(test_apple_packages_args)
diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
new file mode 100644
index 0000000000000..c4db7735aaf2f
--- /dev/null
+++ b/tools/ci_build/github/azure-pipelines/win-gpu-webgpu-ci-pipeline.yml
@@ -0,0 +1,58 @@
+##### start trigger Don't edit it manually, Please do edit set-trigger-rules.py ####
+### please do rerun set-trigger-rules.py ###
+trigger:
+  branches:
+    include:
+    - main
+    - rel-*
+  paths:
+    exclude:
+    - docs/**
+    - README.md
+    - CONTRIBUTING.md
+    - BUILD.md
+    - 'js/web'
+    - 'onnxruntime/core/providers/js'
+pr:
+  branches:
+    include:
+    - main
+    - rel-*
+  paths:
+    exclude:
+    - docs/**
+    - README.md
+    - CONTRIBUTING.md
+    - BUILD.md
+    - 'js/web'
+    - 'onnxruntime/core/providers/js'
+#### end trigger ####
+
+parameters:
+- name: RunOnnxRuntimeTests
+  displayName: Run Tests?
+  type: boolean
+  default: true
+
+stages:
+- stage: webgpu
+  dependsOn: []
+  jobs:
+    - template: templates/jobs/win-ci-vs-2022-job.yml
+      parameters:
+        BuildConfig: 'RelWithDebInfo'
+        EnvSetupScript: setup_env_cuda.bat
+        buildArch: x64
+        # add --enable_pybind and --build_java if necessary
+        additionalBuildFlags: >-
+          --build_nodejs
+          --use_webgpu
+          --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON
+        msbuildPlatform: x64
+        isX86: false
+        job_name_suffix: x64_RelWithDebInfo
+        RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }}
+        ORT_EP_NAME: WebGPU
+        EnablePython: false
+        WITH_CACHE: true
+        MachinePool: onnxruntime-Win2022-VS2022-webgpu-A10
diff --git a/tools/ci_build/set-trigger-rules.py b/tools/ci_build/set-trigger-rules.py
index fb6aa44cdf31a..0e9cd514d8aa5 100644
--- a/tools/ci_build/set-trigger-rules.py
+++ b/tools/ci_build/set-trigger-rules.py
@@ -40,6 +40,7 @@
     "win-gpu-training-ci-pipeline.yml",
     "win-gpu-doc-gen-ci-pipeline.yml",
     "win-gpu-tensorrt-ci-pipeline.yml",
+    "win-gpu-webgpu-ci-pipeline.yml",
     "win-qnn-arm64-ci-pipeline.yml",
     "win-qnn-ci-pipeline.yml",
 ]