From 99a8400e903ab330e3067629d9aa4e23ce82cf12 Mon Sep 17 00:00:00 2001 From: zesongw Date: Tue, 9 Jan 2024 09:16:52 +0800 Subject: [PATCH 01/29] [WebNN EP] Fall back resize nearest mode for WebNN CPU backend (#19039) WebNN CPU backend only supports linear mode. Fall back for this case. --- .../webnn/builders/impl/resize_op_builder.cc | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc index ea9fc379ee23f..186d1e7c1035a 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/resize_op_builder.cc @@ -30,7 +30,7 @@ class ResizeOpBuilder : public BaseOpBuilder { // Operator support related. private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; + const WebnnDeviceType device_type, const logging::Logger& logger) const override; // Resize opset 10- is very different than Resize opset 11+, with many key attributes missing. // We only support Resize opset 11+ here. @@ -161,7 +161,7 @@ Status ResizeOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, - const WebnnDeviceType /* device_type */, + const WebnnDeviceType device_type, const logging::Logger& logger) const { const auto& input_defs = node.InputDefs(); @@ -181,9 +181,18 @@ bool ResizeOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers const auto mode = helper.Get("mode", "nearest"); bool is_linear_resize = mode == "linear"; bool is_nearest_resize = mode == "nearest"; - if (!is_linear_resize && !is_nearest_resize) { - LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode; - return false; + // WebNN CPU backend only supports "linear" mode. + // WebNN GPU backend only supports "linear" and "nearest" modes. 
+ if (device_type == WebnnDeviceType::CPU) { + if (!is_linear_resize) { + LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode << " for CPU backend."; + return false; + } + } else { + if (!is_linear_resize && !is_nearest_resize) { + LOGS(logger, VERBOSE) << "Resize unsupported input mode, " << mode << " for GPU backend."; + return false; + } } const auto exclude_outside = helper.Get("exclude_outside", 0); From 975a315cd70aac54a6c4ff8b7d4e0a76d25666de Mon Sep 17 00:00:00 2001 From: Jeff Bloomfield <38966965+jeffbloo@users.noreply.github.com> Date: Mon, 8 Jan 2024 17:49:19 -0800 Subject: [PATCH 02/29] Fix x86 build error in GraphDescBuilder.cpp affecting packaging pipeline (#19045) ### Description This addresses a 32 bit build error affecting the packaging pipeline ### Motivation and Context --- .../providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp index adb4fd131119f..c6a15e76f4736 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/GraphDescBuilder.cpp @@ -360,7 +360,7 @@ namespace Dml::GraphDescBuilder // The tensor description's size should be no larger than the constant input unless it was rounded to // the required alignment. assert(((constantInput->GetTensorByteSize() + 3) & ~3) >= tensorDesc->totalTensorSizeInBytes); - size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), tensorDesc->totalTensorSizeInBytes); + size_t minimumConstantSize = std::min(constantInput->GetTensorByteSize(), gsl::narrow_cast(tensorDesc->totalTensorSizeInBytes)); auto data = static_cast(constantInput->GetData()); std::vector tensorData(data, data + minimumConstantSize); From a8bb1df331e56e3a65106578b6475d89d17b27c5 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Mon, 8 Jan 2024 17:58:38 -0800 Subject: [PATCH 03/29] [js/webgpu] fix heap access > 2GB (#19010) --- onnxruntime/core/providers/js/js_kernel.h | 1 + .../core/providers/js/operators/conv.h | 12 ++--- .../providers/js/operators/conv_transpose.h | 20 ++++---- onnxruntime/core/providers/js/operators/pad.h | 2 +- .../core/providers/js/operators/reduce.h | 46 +++++++++---------- .../core/providers/js/operators/resize.h | 2 +- .../core/providers/js/operators/slice.h | 6 +-- .../core/providers/js/operators/split.h | 2 +- .../core/providers/js/operators/transpose.h | 2 +- 9 files changed, 47 insertions(+), 46 deletions(-) diff --git a/onnxruntime/core/providers/js/js_kernel.h b/onnxruntime/core/providers/js/js_kernel.h index 5c2d1f0b881ba..b850bea4bc275 100644 --- a/onnxruntime/core/providers/js/js_kernel.h +++ b/onnxruntime/core/providers/js/js_kernel.h @@ -67,6 +67,7 @@ namespace js { float value; \ ORT_ENFORCE(info.GetAttr(#attr_name, &value));, \ , ({#attr_name : $1}), static_cast(value)) +#define JSEP_HEAP_PTR(ptr) reinterpret_cast(ptr) // TODO: // class JsMultiProgramKernel : public OpKernel { /* TBD */ }; diff --git a/onnxruntime/core/providers/js/operators/conv.h b/onnxruntime/core/providers/js/operators/conv.h index 5c0fbf93a4004..98a530c6b77f6 100644 --- a/onnxruntime/core/providers/js/operators/conv.h +++ b/onnxruntime/core/providers/js/operators/conv.h @@ -54,13 +54,13 @@ class ConvBase : public JsKernel { static_cast(conv_attrs_.group), static_cast(kernel_shape_0), 
static_cast(local_pads.size()), - reinterpret_cast(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, + JSEP_HEAP_PTR(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), conv_attrs_.activation.c_str(), activation_params.size(), - reinterpret_cast(activation_params_ptr) >> 2); + JSEP_HEAP_PTR(activation_params_ptr) >> 2); } else { JSEP_INIT_KERNEL_ATTRIBUTE(Conv, ({ "format" : $11 ? "NHWC" : "NCHW", @@ -81,14 +81,14 @@ class ConvBase : public JsKernel { static_cast(kernel_shape_0), static_cast(kernel_shape_1), static_cast(local_pads.size()), - reinterpret_cast(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, + JSEP_HEAP_PTR(local_pads.size() > 0 ? local_pads.data() : nullptr) >> 2, static_cast(conv_attrs_.strides.size() > 0 ? conv_attrs_.strides[0] : 0), static_cast(conv_attrs_.strides.size() > 1 ? conv_attrs_.strides[1] : 0), static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), conv_attrs_.activation.c_str(), activation_params.size(), - reinterpret_cast(activation_params_ptr) >> 2); + JSEP_HEAP_PTR(activation_params_ptr) >> 2); } } diff --git a/onnxruntime/core/providers/js/operators/conv_transpose.h b/onnxruntime/core/providers/js/operators/conv_transpose.h index 5d30dc851e00f..353a946e95c21 100644 --- a/onnxruntime/core/providers/js/operators/conv_transpose.h +++ b/onnxruntime/core/providers/js/operators/conv_transpose.h @@ -64,11 +64,11 @@ class ConvTranspose : public JsKernel { static_cast(pads_1), static_cast(strides), static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), gsl::narrow_cast(local_output_padding.size()), - reinterpret_cast(local_output_padding_ptr) >> 2, + JSEP_HEAP_PTR(local_output_padding_ptr) >> 2, gsl::narrow_cast(local_output_shape.size()), - reinterpret_cast(local_output_shape_ptr) >> 2, + JSEP_HEAP_PTR(local_output_shape_ptr) >> 2, conv_transpose_attrs_.activation.c_str()); } else { constexpr size_t pads_vec_size = 4; @@ -114,17 +114,17 @@ class ConvTranspose : public JsKernel { "activation" : UTF8ToString($13) }), static_cast(conv_transpose_attrs_.auto_pad), - reinterpret_cast(local_dilations.data()) >> 2, + JSEP_HEAP_PTR(local_dilations.data()) >> 2, static_cast(conv_transpose_attrs_.group), - reinterpret_cast(local_kernel_shape.data()) >> 2, - reinterpret_cast(local_pads.data()) >> 2, - reinterpret_cast(local_strides.data()) >> 2, + JSEP_HEAP_PTR(local_kernel_shape.data()) >> 2, + JSEP_HEAP_PTR(local_pads.data()) >> 2, + JSEP_HEAP_PTR(local_strides.data()) >> 2, static_cast(channels_last), - reinterpret_cast(&w_is_const_), + JSEP_HEAP_PTR(&w_is_const_), gsl::narrow_cast(local_output_padding.size()), - reinterpret_cast(local_output_padding_ptr) >> 2, + JSEP_HEAP_PTR(local_output_padding_ptr) >> 2, gsl::narrow_cast(local_output_shape.size()), - reinterpret_cast(local_output_shape_ptr) >> 2, + JSEP_HEAP_PTR(local_output_shape_ptr) >> 2, conv_transpose_attrs_.activation.c_str()); } } diff --git a/onnxruntime/core/providers/js/operators/pad.h b/onnxruntime/core/providers/js/operators/pad.h index 19168f40b4722..bf808be949cf8 100644 --- a/onnxruntime/core/providers/js/operators/pad.h +++ b/onnxruntime/core/providers/js/operators/pad.h @@ -26,7 +26,7 @@ class Pad : public JsKernel, public PadBase { static_cast(mode_), static_cast(value_), gsl::narrow_cast(pads.size()), - reinterpret_cast((pads.size() > 
0) ? pads.data() : nullptr) >> 2); + JSEP_HEAP_PTR((pads.size() > 0) ? pads.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/reduce.h b/onnxruntime/core/providers/js/operators/reduce.h index a5a4aa834c2ca..95c4f2bec230d 100644 --- a/onnxruntime/core/providers/js/operators/reduce.h +++ b/onnxruntime/core/providers/js/operators/reduce.h @@ -8,29 +8,29 @@ namespace onnxruntime { namespace js { -#define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \ - template \ - class ReduceKernel : public JsKernel, public ReduceKernelBase { \ - public: \ - using ReduceKernelBase::axes_; \ - using ReduceKernelBase::noop_with_empty_axes_; \ - using ReduceKernelBase::keepdims_; \ - ReduceKernel(const OpKernelInfo& info) : JsKernel(info), ReduceKernelBase(info) { \ - std::vector axes(axes_.size()); \ - if (axes_.size() > 0) { \ - std::transform(axes_.begin(), axes_.end(), axes.begin(), \ - [](int64_t axis) { return gsl::narrow_cast(axis); }); \ - } \ - JSEP_INIT_KERNEL_ATTRIBUTE(ReduceKernel, ({ \ - "keepDims" : !!$1, \ - "noopWithEmptyAxes" : !!$2, \ - "axes" : $3 ? (Array.from(HEAP32.subarray($4, $4 + $3))) : [], \ - }), \ - static_cast(keepdims_), \ - static_cast(noop_with_empty_axes_), \ - gsl::narrow_cast(axes.size()), \ - reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2); \ - } \ +#define JSEP_DEFINE_REDUCE_KERNEL(ReduceKernel) \ + template \ + class ReduceKernel : public JsKernel, public ReduceKernelBase { \ + public: \ + using ReduceKernelBase::axes_; \ + using ReduceKernelBase::noop_with_empty_axes_; \ + using ReduceKernelBase::keepdims_; \ + ReduceKernel(const OpKernelInfo& info) : JsKernel(info), ReduceKernelBase(info) { \ + std::vector axes(axes_.size()); \ + if (axes_.size() > 0) { \ + std::transform(axes_.begin(), axes_.end(), axes.begin(), \ + [](int64_t axis) { return gsl::narrow_cast(axis); }); \ + } \ + JSEP_INIT_KERNEL_ATTRIBUTE(ReduceKernel, ({ \ + "keepDims" : !!$1, \ + "noopWithEmptyAxes" : !!$2, \ + "axes" : $3 ? (Array.from(HEAP32.subarray($4, $4 + $3))) : [], \ + }), \ + static_cast(keepdims_), \ + static_cast(noop_with_empty_axes_), \ + gsl::narrow_cast(axes.size()), \ + JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2); \ + } \ }; JSEP_DEFINE_REDUCE_KERNEL(ReduceMax); diff --git a/onnxruntime/core/providers/js/operators/resize.h b/onnxruntime/core/providers/js/operators/resize.h index 65854222ba988..4b1c288ae3015 100644 --- a/onnxruntime/core/providers/js/operators/resize.h +++ b/onnxruntime/core/providers/js/operators/resize.h @@ -34,7 +34,7 @@ class Resize : public JsKernel, public UpsampleBase { }), static_cast(antialias_), gsl::narrow_cast(axes.size()), - reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2, + JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2, resize_coordinate_transformation_mode.c_str(), static_cast(cubic_coeff_a_), static_cast(exclude_outside_), diff --git a/onnxruntime/core/providers/js/operators/slice.h b/onnxruntime/core/providers/js/operators/slice.h index 6792997025d65..989adabf029a5 100644 --- a/onnxruntime/core/providers/js/operators/slice.h +++ b/onnxruntime/core/providers/js/operators/slice.h @@ -24,11 +24,11 @@ class Slice : public JsKernel, public SliceBase { "ends" : $3 ? Array.from(HEAP32.subarray($4, $4 + $3)) : [], "axes" : $5 ? Array.from(HEAP32.subarray($6, $6 + $5)) : []}), gsl::narrow_cast(starts.size()), - reinterpret_cast((starts.size() > 0) ? starts.data() : nullptr) >> 2, + JSEP_HEAP_PTR((starts.size() > 0) ? 
starts.data() : nullptr) >> 2, gsl::narrow_cast(ends.size()), - reinterpret_cast((ends.size() > 0) ? ends.data() : nullptr) >> 2, + JSEP_HEAP_PTR((ends.size() > 0) ? ends.data() : nullptr) >> 2, gsl::narrow_cast(axes.size()), - reinterpret_cast((axes.size() > 0) ? axes.data() : nullptr) >> 2); + JSEP_HEAP_PTR((axes.size() > 0) ? axes.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/split.h b/onnxruntime/core/providers/js/operators/split.h index cfacc1aa6a363..1c1874e5aa98e 100644 --- a/onnxruntime/core/providers/js/operators/split.h +++ b/onnxruntime/core/providers/js/operators/split.h @@ -53,7 +53,7 @@ class Split : public JsKernel, public SplitBase { static_cast(axis_), static_cast(num_outputs_), gsl::narrow_cast(split_sizes.size()), - reinterpret_cast((split_sizes.size() > 0) ? split_sizes.data() : nullptr) >> 2); + JSEP_HEAP_PTR((split_sizes.size() > 0) ? split_sizes.data() : nullptr) >> 2); } }; diff --git a/onnxruntime/core/providers/js/operators/transpose.h b/onnxruntime/core/providers/js/operators/transpose.h index 311badbde0d11..dae442b9f5a13 100644 --- a/onnxruntime/core/providers/js/operators/transpose.h +++ b/onnxruntime/core/providers/js/operators/transpose.h @@ -27,7 +27,7 @@ class Transpose final : public JsKernel, public TransposeBase { gsl::narrow_cast(perm_specified_ ? perm_.size() : 0), // $2: index to HEAP32 of the first int32 element. calculated from right shift memory // address by 2 - reinterpret_cast(perm_specified_ && !perm.empty() ? perm.data() : nullptr) >> 2); + JSEP_HEAP_PTR(perm_specified_ && !perm.empty() ? perm.data() : nullptr) >> 2); } }; From 8f024b739439c521cd19fe5a9830d4015de99bd7 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Tue, 9 Jan 2024 10:16:25 +0800 Subject: [PATCH 04/29] [js/webgpu] Support uniforms for layer-norm (#18755) --- .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 4 +- js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts | 83 ++++++++++--------- 2 files changed, 46 insertions(+), 41 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 8e1ec782079be..06c3c6c196501 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -17,7 +17,7 @@ import {gather, parseGatherAttributes} from './ops/gather'; import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements'; import {gemm, parseGemmAttributes} from './ops/gemm'; import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm'; -import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm'; +import {layerNorm} from './ops/layer-norm'; import {matMul} from './ops/matmul'; import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multi-head-attentiion'; import {pad, parsePadAttributes} from './ops/pad'; @@ -83,7 +83,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Greater', [binaryOps.greater]], ['GreaterOrEqual', [binaryOps.greaterOrEqual]], ['InstanceNormalization', [instanceNorm, parseInstanceNormAttributes]], - ['LayerNormalization', [layerNorm, parseLayerNormAttributes]], + ['LayerNormalization', [layerNorm]], ['LeakyRelu', [unaryOps.leakyRelu, unaryOps.parseAlphaAttributes]], ['Less', [binaryOps.less]], ['LessOrEqual', [binaryOps.lessOrEqual]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts index 8a9eeecf2c68d..bc446079faf8f 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts +++ 
b/js/web/lib/wasm/jsep/webgpu/ops/layer-norm.ts @@ -4,12 +4,11 @@ import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType,} from './common'; +import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, UniformsArrayType,} from './common'; -export interface LayerNormAttributes extends AttributeWithCacheKey { +interface LayerNormAttributes { axis: number; epsilon: number; } @@ -39,7 +38,7 @@ const createLayerNormProgramInfo = Got scale size of ${scaleSize} and bias size of ${biasSize}`); } - const meanInvStdDevDim = []; + const meanInvStdDevDim: number[] = []; for (let i = 0; i < xShape.length; ++i) { if (i < axis) { meanInvStdDevDim.push(xShape[i]); @@ -47,50 +46,57 @@ const createLayerNormProgramInfo = meanInvStdDevDim.push(1); } } - const components = getMaxComponents(normSize); - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - const variables = [ - inputVariable('x', inputs[0].dataType, inputs[0].dims, components), - inputVariable('scale', scale.dataType, scale.dims, components), + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type']; + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: normCount}, {type: 'float32', data: normSize}, + {type: 'uint32', data: Math.floor(normSize / components)}, {type: 'float32', data: attributes.epsilon} ]; if (bias) { - variables.push(inputVariable('bias', bias.dataType, bias.dims, components)); + inputDependencies.push('type'); } - variables.push(outputVariable('output', inputs[0].dataType, outputShape, components)); - const hasMeanDataOutput = outputCount > 1; const hasInvStdOutput = outputCount > 2; - if (hasMeanDataOutput) { - variables.push(outputVariable('meanDataOutput', DataType.float, meanInvStdDevDim)); - } - if (hasInvStdOutput) { - variables.push(outputVariable('invStdOutput', DataType.float, meanInvStdDevDim)); - } - - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const normSize: f32 = ${normSize}; - const normSizeVectorized: u32 = ${normSize / components}; - const epsilon: f32 = ${attributes.epsilon}; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + const variables = [ + inputVariable('x', inputs[0].dataType, inputs[0].dims, components), + inputVariable('scale', scale.dataType, scale.dims, components), + ]; + if (bias) { + variables.push(inputVariable('bias', bias.dataType, bias.dims, components)); + } + variables.push(outputVariable('output', inputs[0].dataType, outputShape, components)); + if (hasMeanDataOutput) { + variables.push(outputVariable('mean_data_output', DataType.float, meanInvStdDevDim)); + } + if (hasInvStdOutput) { + variables.push(outputVariable('inv_std_output', DataType.float, meanInvStdDevDim)); + } - ${shaderHelper.declareVariables(...variables)} + const uniforms: UniformsArrayType = [ + {name: 'norm_count', type: 'u32'}, {name: 'norm_size', type: 'f32'}, + {name: 'norm_size_vectorized', type: 'u32'}, {name: 'epsilon', type: 'f32'} + 
]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(normCount)} - let offset = global_idx * normSizeVectorized; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.norm_count')} + let offset = global_idx * uniforms.norm_size_vectorized; var meanVector = ${fillVector('f32', components)}; var meanSquareVector = ${fillVector('f32', components)}; - for (var h: u32 = 0u; h < normSizeVectorized; h++) { + for (var h: u32 = 0u; h < uniforms.norm_size_vectorized; h++) { let value = ${castToF32(dataType, components, 'x[h + offset]')}; meanVector += value; meanSquareVector += value * value; } - let mean = ${sumVector('meanVector', components)} / normSize; - let meanSquare = sqrt(${sumVector('meanSquareVector', components)} - / normSize - mean * mean + epsilon); + let mean = ${sumVector('meanVector', components)} / uniforms.norm_size; + let meanSquare = sqrt(${sumVector('meanSquareVector', components)} + / uniforms.norm_size - mean * mean + uniforms.epsilon); - for (var j: u32 = 0; j < normSizeVectorized; j++) { + for (var j: u32 = 0; j < uniforms.norm_size_vectorized; j++) { let f32input = ${castToF32(dataType, components, 'x[j + offset]')}; let f32scale = ${castToF32(dataType, components, 'scale[j]')}; output[j + offset] = ${variables[0].type.value}((f32input - mean) / meanSquare * f32scale @@ -98,9 +104,10 @@ const createLayerNormProgramInfo = ); } - ${hasMeanDataOutput ? 'meanDataOutput[global_idx] = mean' : ''}; - ${hasInvStdOutput ? 'invStdOutput[global_idx] = 1 / meanSquare' : ''}; + ${hasMeanDataOutput ? 'mean_data_output[global_idx] = mean' : ''}; + ${hasInvStdOutput ? 'inv_std_output[global_idx] = 1 / meanSquare' : ''}; }`; + }; const outputs = [{dims: outputShape, dataType: inputs[0].dataType}]; if (hasMeanDataOutput) { outputs.push({dims: meanInvStdDevDim, dataType: DataType.float}); @@ -111,15 +118,13 @@ const createLayerNormProgramInfo = return { name: 'LayerNormalization', - shaderCache: {hint: `${attributes.cacheKey}|${outputCount}|${inputs.length}`}, - getRunData: () => ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}}), + shaderCache: {hint: `${components};${outputCount}`, inputDependencies}, + getRunData: () => + ({outputs, dispatchGroup: {x: Math.ceil(normCount / 64 /* workgroup size */)}, programUniforms}), getShaderSource, }; }; -export const parseLayerNormAttributes = (attributes: LayerNormAttributes): LayerNormAttributes => - createAttributeWithCacheKey({axis: attributes.axis, epsilon: attributes.epsilon}); - export const layerNorm = (context: ComputeContext, attributes: LayerNormAttributes): void => { validateInputs(context.inputs); context.compute(createLayerNormProgramInfo(context.inputs, attributes, context.outputCount)); From 68c29ece23821b1d2b73ac55c2a4266c72865219 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Mon, 8 Jan 2024 19:46:33 -0800 Subject: [PATCH 05/29] In a Linux or Android build check if the compiler support bfloat16 and float16 (#18813) ### Description Restrict clang version because we have an upcoming change that requires clang version >=16 , which will mainly affect Android build. 
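The gate itself is a standard `check_cxx_compiler_flag` probe: CMake compiles a trivial translation unit with the given `-march` flag and records whether the compiler accepted it. For reference, here is a minimal stand-alone sketch that reproduces the same toolchain test (a hypothetical `probe/CMakeLists.txt`, not part of this patch); it can be used to verify an aarch64 cross toolchain, e.g. an Android NDK, before starting a full build:

```cmake
# Hypothetical stand-alone probe reproducing the gate added below.
# Configure with the cross toolchain under test, e.g.:
#   cmake -S probe -B build -DCMAKE_TOOLCHAIN_FILE=<your-aarch64-toolchain>.cmake
cmake_minimum_required(VERSION 3.13)
project(arm64_fp16_bf16_probe CXX)
include(CheckCXXCompilerFlag)

# Each check compiles a small test file with the flag and caches the result.
check_cxx_compiler_flag(-march=armv8.2-a+bf16 HAS_ARM64_BFLOAT16)
check_cxx_compiler_flag(-march=armv8.2-a+fp16 HAS_ARM64_FLOAT16)

if(NOT HAS_ARM64_BFLOAT16 OR NOT HAS_ARM64_FLOAT16)
  # Older compilers (e.g. clang < 16 in some Android NDKs) fail here.
  message(FATAL_ERROR "Toolchain lacks ARMv8.2-A FP16/BF16 support")
endif()
```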
--- cmake/CMakeLists.txt | 27 +++++++++++++++++-------- cmake/adjust_global_compile_flags.cmake | 25 +++++++++++++++++++++++ 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 34355fb0fd936..0f57258dca706 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -354,13 +354,7 @@ if (onnxruntime_USE_ROCM) endif() endif() -if (APPLE) - if (NOT CMAKE_OSX_ARCHITECTURES) - message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR}") - endif() -elseif (NOT WIN32 AND NOT APPLE) - message("Building ONNX Runtime for ${CMAKE_SYSTEM_PROCESSOR}") -endif() + # Single output director for all binaries set(RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Single output directory for all binaries.") @@ -493,6 +487,14 @@ endif() include(adjust_global_compile_flags.cmake) +if (APPLE) + if (NOT CMAKE_OSX_ARCHITECTURES) + message("Building ONNX Runtime for ${CMAKE_HOST_SYSTEM_PROCESSOR} CPU ARCH") + endif() +elseif (NOT WIN32 AND NOT APPLE) + message("Building ONNX Runtime for ${onnxruntime_target_platform} CPU ARCH") +endif() + # We need to link with libatomic on systems that do not have built-in atomics, or # don't have built-in support for 8 byte atomics # Derived from https://github.com/protocolbuffers/protobuf/blob/master/cmake/CMakeLists.txt @@ -639,7 +641,16 @@ else() check_cxx_compiler_flag(-Wunused-variable HAS_UNUSED_VARIABLE) check_cxx_compiler_flag(-Wuseless-cast HAS_USELESS_CAST) check_function_exists(reallocarray HAS_REALLOCARRAY) - + if (NOT APPLE AND NOT CMAKE_SYSTEM_NAME STREQUAL "Emscripten" AND onnxruntime_target_platform STREQUAL "aarch64") + check_cxx_compiler_flag(-march=armv8.2-a+bf16 HAS_ARM64_BFLOAT16) + if(NOT HAS_ARM64_BFLOAT16) + message(FATAL_ERROR "The compiler doesn't support BFLOAT16!!!") + endif() + check_cxx_compiler_flag(-march=armv8.2-a+fp16 HAS_ARM64_FLOAT16) + if(NOT HAS_ARM64_FLOAT16) + message(FATAL_ERROR "The compiler doesn't support FLOAT16!!!") + endif() + endif() if (HAS_TAUTOLOGICAL_POINTER_COMPARE) #we may have extra null pointer checkings in debug build, it's not an issue list(APPEND ORT_WARNING_FLAGS -Wno-tautological-pointer-compare) diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index e825bfeaea952..9f00c873715f4 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -300,6 +300,31 @@ if (MSVC) endif() else() if (NOT APPLE) + #XXX: Sometimes the value of CMAKE_SYSTEM_PROCESSOR is set but it's wrong. For example, if you run an armv7 docker + #image on an aarch64 machine with an aarch64 Ubuntu host OS, in the docker instance cmake may still report + # CMAKE_SYSTEM_PROCESSOR as aarch64 by default. Given compiling this code may need more than 2GB memory, we do not + # support compiling for ARM32 natively(only support cross-compiling), we will ignore this issue for now. + if(NOT CMAKE_SYSTEM_PROCESSOR) + message(WARNING "CMAKE_SYSTEM_PROCESSOR is not set. 
Please set it in your toolchain cmake file.") + # Try to detect it + if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" OR "${CMAKE_C_COMPILER_ID}" STREQUAL "Clang") + execute_process( + COMMAND "${CMAKE_C_COMPILER}" -dumpmachine + OUTPUT_VARIABLE GCC_DUMP_MACHINE_OUT OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_VARIABLE _err + RESULT_VARIABLE _res + ) + if(NOT _res EQUAL 0) + message(SEND_ERROR "Failed to run 'gcc -dumpmachine':\n ${_res}") + endif() + string(REPLACE "-" ";" GCC_DUMP_MACHINE_OUT_LIST "${GCC_DUMP_MACHINE_OUT}") + list(LENGTH GCC_DUMP_MACHINE_OUT_LIST GCC_TRIPLET_LEN) + if(GCC_TRIPLET_LEN EQUAL 4) + list(GET GCC_DUMP_MACHINE_OUT_LIST 0 CMAKE_SYSTEM_PROCESSOR) + message("Setting CMAKE_SYSTEM_PROCESSOR to ${CMAKE_SYSTEM_PROCESSOR}") + endif() + endif() + endif() set(onnxruntime_target_platform ${CMAKE_SYSTEM_PROCESSOR}) endif() if (onnxruntime_BUILD_FOR_NATIVE_MACHINE) From eb35896ede6e77bf4a453b9e7314e728e40f96ba Mon Sep 17 00:00:00 2001 From: zesongw Date: Tue, 9 Jan 2024 14:02:44 +0800 Subject: [PATCH 06/29] [WebNN EP] Update WebNN normalization ops (#18817) Use batchNormalization, layerNormalization and instanceNormalization instead of meanVarianceNormalization to implement normalization Ops. The spec of meanVarianceNormalization has been deleted. Remove groupNormalization. --- .../core/providers/webnn/builders/helper.h | 7 +- .../builders/impl/normalization_op_builder.cc | 141 +++++++----------- .../webnn/builders/op_builder_factory.cc | 1 - 3 files changed, 57 insertions(+), 92 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index 8b8b85339a87c..5aec81af15761 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -139,7 +139,7 @@ static const InlinedHashMap op_map = { {"ArgMax", {"argMax", false}}, {"ArgMin", {"argMin", false}}, {"AveragePool", {"averagePool2d", true}}, - {"BatchNormalization", {"meanVarianceNormalization", false}}, + {"BatchNormalization", {"batchNormalization", false}}, {"Cast", {"cast", false}}, {"Ceil", {"ceil", true}}, {"Clip", {"clamp", true}}, @@ -162,12 +162,11 @@ static const InlinedHashMap op_map = { {"GlobalLpPool", {"l2Pool2d", false}}, {"Greater", {"greater", false}}, {"GreaterOrEqual", {"greaterOrEqual", false}}, - {"GroupNormalization", {"meanVarianceNormalization", false}}, {"HardSigmoid", {"hardSigmoid", false}}, {"HardSwish", {"hardSwish", true}}, {"Identity", {"identity", false}}, - {"InstanceNormalization", {"meanVarianceNormalization", false}}, - {"LayerNormalization", {"meanVarianceNormalization", false}}, + {"InstanceNormalization", {"instanceNormalization", false}}, + {"LayerNormalization", {"layerNormalization", false}}, {"LeakyRelu", {"leakyRelu", true}}, {"Less", {"lesser", false}}, {"LessOrEqual", {"lesserOrEqual", false}}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc index 756a838cc0c3e..4d2470dfe7deb 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/normalization_op_builder.cc @@ -27,8 +27,6 @@ class NormalizationOpBuilder : public BaseOpBuilder { const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; }; -// All normalization are based on layout NCHW. -// TODO: add support for NHWC. 
Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, const logging::Logger& logger) const { @@ -61,49 +59,13 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder ORT_RETURN_IF_NOT(bias_shape == scale_shape, "The bias' shape should be equal to scale's shape."); } - std::vector new_scale_shape; - if (scale_size < rank) { - if (op_type == "BatchNormalization") { - scale_shape.insert(scale_shape.begin(), 1); - scale_shape.insert(scale_shape.end(), rank - 2, 1); - } else if (op_type == "LayerNormalization") { - // Align right with leading ones. - scale_shape.insert(scale_shape.begin(), rank - scale_size, 1); - } else if (op_type == "InstanceNormalization") { - // Insert ones before and after the channel dimension. - scale_shape.insert(scale_shape.begin(), 1); - ORT_RETURN_IF(scale_size != 1 || rank < 2, - "The scale size should be 1 and rank should be at least 2 for InstanceNorm."); - scale_shape.insert(scale_shape.end(), rank - scale_size - 1, 1); - } else if (op_type == "GroupNormalization") { - // The input will be reshaped to 3D later. So just insert ones before the channel and after. - scale_shape.insert(scale_shape.begin(), 1); - scale_shape.insert(scale_shape.end(), 1); - } else { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported normalization op: ", op_type); - } + emscripten::val scale = model_builder.GetOperand(input_defs[1]->Name()); + options.set("scale", scale); - std::transform(scale_shape.cbegin(), scale_shape.cend(), - std::back_inserter(new_scale_shape), - [](int64_t dim) -> uint32_t { return SafeInt(dim); }); - emscripten::val reshape_scale = model_builder.GetOperand(input_defs[1]->Name()); - emscripten::val reshape_output_scale = - model_builder.GetBuilder().call("reshape", reshape_scale, emscripten::val::array(new_scale_shape)); - options.set("scale", reshape_output_scale); - - if (input_defs.size() >= 3 && !input_defs[2]->Name().empty()) { - // Bias input exists, and bias's shape is the same as scale's shape. - emscripten::val reshape_bias = model_builder.GetOperand(input_defs[2]->Name()); - emscripten::val reshape_output_bias = - model_builder.GetBuilder().call("reshape", reshape_bias, emscripten::val::array(new_scale_shape)); - options.set("bias", reshape_output_bias); - } - } else { - options.set("scale", model_builder.GetOperand(input_defs[1]->Name())); - if (input_defs.size() >= 3 && !input_defs[2]->Name().empty()) { - // Bias input exists, and bias's shape is the same as scale's shape. - options.set("bias", model_builder.GetOperand(input_defs[2]->Name())); - } + if (input_defs.size() >= 3 && !input_defs[2]->Name().empty()) { + // Bias input exists, and bias's shape is the same as scale's shape. + emscripten::val bias = model_builder.GetOperand(input_defs[2]->Name()); + options.set("bias", bias); } NodeAttrHelper helper(node); @@ -114,56 +76,62 @@ Status NormalizationOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder ORT_RETURN_IF_NOT(input_defs.size() == 5, "BatchNormalization requires five inputs."); emscripten::val mean = model_builder.GetOperand(input_defs[3]->Name()); emscripten::val variance = model_builder.GetOperand(input_defs[4]->Name()); - // Enlarge 1-D mean and variance to new scale shape. 
- emscripten::val reshape_mean = - model_builder.GetBuilder().call("reshape", mean, emscripten::val::array(new_scale_shape)); - emscripten::val reshape_variance = - model_builder.GetBuilder().call("reshape", variance, emscripten::val::array(new_scale_shape)); - - std::vector axes = {0}; - for (uint32_t i = 2; i < rank; i++) { - axes.push_back(i); + if (model_builder.GetPreferredLayout() == DataLayout::NHWC) { + options.set("axis", rank - 1); } - - options.set("axes", emscripten::val::array(axes)); - options.set("mean", reshape_mean); - options.set("variance", reshape_variance); - output = model_builder.GetBuilder().call("meanVarianceNormalization", input, options); + output = model_builder.GetBuilder().call("batchNormalization", input, mean, variance, options); } else if (op_type == "LayerNormalization") { int64_t axis = helper.Get("axis", -1); axis = HandleNegativeAxis(axis, rank); std::vector axes(rank - SafeInt(axis)); - std::iota(axes.begin(), axes.end(), axis); + if (model_builder.GetPreferredLayout() == DataLayout::NHWC && axis > 1) { + std::iota(axes.begin(), axes.end(), axis - 1); + } else { + std::iota(axes.begin(), axes.end(), axis); + } options.set("axes", emscripten::val::array(axes)); - output = model_builder.GetBuilder().call("meanVarianceNormalization", input, options); + output = model_builder.GetBuilder().call("layerNormalization", input, options); } else if (op_type == "InstanceNormalization") { - std::vector axes; - for (uint32_t i = 2; i < rank; i++) { - axes.emplace_back(i); + // WebNN spec only supports 4D input for instanceNormalization. + // Supports 3D input by prepending 1 size dimension. + // For models with dimensions greater than 4, they will be reshaped into 4D. + constexpr size_t webnn_shape_rank = 4; + if (input_shape.size() != webnn_shape_rank) { + std::vector new_shape; + new_shape.reserve(std::max(input_shape.size(), webnn_shape_rank)); + std::transform(input_shape.begin(), input_shape.end(), + std::back_inserter(new_shape), + [](int64_t dim) -> uint32_t { return SafeInt(dim); }); + + size_t insertion_offset = (model_builder.GetPreferredLayout() == DataLayout::NHWC) ? 2 : 3; + ptrdiff_t excess_rank = new_shape.size() - webnn_shape_rank; + auto insertion_point = new_shape.begin() + insertion_offset; + if (input_shape.size() < webnn_shape_rank) { + // Pad the shape with extra 1's to satisfy WebNN v1's rank requirements. + new_shape.insert(insertion_point, -excess_rank, 1); + } else { + // Fold the extra range to fit within WebNN v1's rank requirements. + uint32_t sum = std::accumulate( + insertion_point, insertion_point + excess_rank + 1, 1, std::multiplies()); + new_shape.erase(insertion_point, insertion_point + excess_rank); + *insertion_point = sum; + } + input = model_builder.GetBuilder().call("reshape", input, emscripten::val::array(new_shape)); + } + + if (model_builder.GetPreferredLayout() == DataLayout::NHWC) { + options.set("layout", emscripten::val("nhwc")); + } + output = model_builder.GetBuilder().call("instanceNormalization", input, options); + // Reshape back to the original output shape for 3D input. 
+ if (input_shape.size() != 4) { + std::vector output_shape; + std::transform(input_shape.begin(), input_shape.end(), + std::back_inserter(output_shape), + [](int64_t dim) -> uint32_t { return SafeInt(dim); }); + output = model_builder.GetBuilder().call( + "reshape", output, emscripten::val::array(output_shape)); } - options.set("axes", emscripten::val::array(axes)); - output = model_builder.GetBuilder().call("meanVarianceNormalization", input, options); - } else if (op_type == "GroupNormalization") { - ORT_RETURN_IF_NOT(helper.HasAttr("num_groups"), "GroupNormalization num_group must be provided."); - int32_t group_count = helper.Get("num_groups", -1); - std::vector orig_shape, new_shape; - std::transform(input_shape.cbegin(), input_shape.cend(), - std::back_inserter(orig_shape), - [](int64_t dim) -> uint32_t { return SafeInt(dim); }); - // Add N and Group. - ORT_RETURN_IF_NOT(rank >= 2, "Input for GroupNormalization cannot be a scalar or 1D"); - new_shape.emplace_back(SafeInt(input_shape[0])); - new_shape.emplace_back(SafeInt(group_count)); - - ORT_RETURN_IF_NOT(group_count > 0 && input_shape[1] % group_count == 0, - "GroupNormalization num_group must be divisible by group."); - new_shape.emplace_back(SafeInt(std::reduce(input_shape.begin() + 2, input_shape.end(), - input_shape[1] / group_count, std::multiplies()))); - // Input will be reshaped to (N, group count, channels per group x D1 x D2 ... Dn) and recovered after normalization. - options.set("axes", emscripten::val::array(std::vector{2})); - output = model_builder.GetBuilder().call("reshape", input, emscripten::val::array(new_shape)); - output = model_builder.GetBuilder().call("meanVarianceNormalization", output, options); - output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(orig_shape)); } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Unsupported normalization op: ", op_type); } @@ -214,7 +182,6 @@ void CreateNormalizationOpBuilder(const std::string& op_type, OpBuilderRegistrat constexpr static std::string_view op_types[] = { "BatchNormalization", - "GroupNormalization", "InstanceNormalization", "LayerNormalization", }; diff --git a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc index 463317a4dafda..613771eda71fe 100644 --- a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc @@ -111,7 +111,6 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { { // Normalization CreateNormalizationOpBuilder("BatchNormalization", op_registrations); - CreateNormalizationOpBuilder("GroupNormalization", op_registrations); CreateNormalizationOpBuilder("InstanceNormalization", op_registrations); CreateNormalizationOpBuilder("LayerNormalization", op_registrations); } From 7cb8b20db2d329cf67e170293b2d2c81213e6100 Mon Sep 17 00:00:00 2001 From: pengwa Date: Tue, 9 Jan 2024 20:05:34 +0800 Subject: [PATCH 07/29] Remove mem consuming test case to unblock running ci on lower-end gpu (#19059) ### Description ### Motivation and Context --- orttraining/orttraining/test/gradient/gradient_ops_test.cc | 6 ------ 1 file changed, 6 deletions(-) diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index 6fb42dd59b6a0..feca94ae27c13 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc 
@@ -2218,12 +2218,6 @@ TEST(GradientUtilsTest, InPlaceAccumulatorV2_GPU) { {3072, 768}, {4096, 768}, {8192, 768}, - {16384, 768}, - {32768, 768}, - {65536, 768}, - {131072, 768}, - {250002, 768}, - {500004, 768}, }; for (const auto& test_dim : test_dims) { From ab897a4a4064e8259ed952cbd33c35d3ddd5f370 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 9 Jan 2024 07:45:03 -0800 Subject: [PATCH 08/29] Remove Windows ARM32 from nuget packaging pipelines (#19049) ### Description 1. Remove Windows ARM32 from nuget packaging pipelines 2. Add missing component-governance-component-detection-steps.yml to some build jobs. ### Motivation and Context Stop supporting Windows ARM32 to align with [Windows's support policy](https://learn.microsoft.com/en-us/windows/arm/arm32-to-arm64). Users who need this feature still can build the DLLs from source. However, later on we will remove that support too. --- ...anch.Nuget-WindowsAI-Pipeline.Official.yml | 66 +------------- .../c-api-noopenmp-packaging-pipelines.yml | 85 +++++++++---------- .../azure-pipelines/templates/c-api-cpu.yml | 26 ------ .../azure-pipelines/templates/py-linux.yml | 4 + 4 files changed, 49 insertions(+), 132 deletions(-) diff --git a/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml b/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml index b9de1b79e1d51..67f9d8b0ce392 100644 --- a/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml +++ b/.pipelines/OneBranch.Nuget-WindowsAI-Pipeline.Official.yml @@ -53,10 +53,6 @@ extends: BuildArch: x86 PythonPackageName: pythonx86 - - template: .pipelines/windowsai-steps.yml@self - parameters: - BuildArch: arm - - template: .pipelines/windowsai-steps.yml@self parameters: BuildArch: arm64 @@ -72,11 +68,6 @@ extends: PythonPackageName: pythonx86 Runtime: static - - template: .pipelines/windowsai-steps.yml@self - parameters: - BuildArch: arm - Runtime: static - - template: .pipelines/windowsai-steps.yml@self parameters: BuildArch: arm64 @@ -94,11 +85,9 @@ extends: dependsOn: - Windows_Packaging_x64_dynamic - Windows_Packaging_x86_dynamic - - Windows_Packaging_arm_dynamic - Windows_Packaging_arm64_dynamic - Windows_Packaging_x64_static - Windows_Packaging_x86_static - - Windows_Packaging_arm_static - Windows_Packaging_arm64_static condition: succeeded() steps: @@ -120,12 +109,6 @@ extends: artifactName: 'drop_Windows_Build_Windows_Packaging_arm64_dynamic' targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64' - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet DirectML arm' - inputs: - artifactName: 'drop_Windows_Build_Windows_Packaging_arm_dynamic' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm' - - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - NuGet DirectML x64 StaticRuntime' inputs: @@ -144,12 +127,6 @@ extends: artifactName: 'drop_Windows_Build_Windows_Packaging_arm64_static' targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64-static-runtime' - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet DirectML arm StaticRuntime' - inputs: - artifactName: 'drop_Windows_Build_Windows_Packaging_arm_static' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm-static-runtime' - - task: PowerShell@2 displayName: 'Bundle NuGet and other binaries' inputs: @@ -194,17 +171,7 @@ extends: $arm64_static_runtime_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm64_static_runtime_nupkg_unzipped_directory_root, 'binaries', 
[System.IO.Path]::GetFileNameWithoutExtension($arm64_static_runtime_nuget_package)) [System.IO.Compression.ZipFile]::ExtractToDirectory($arm64_static_runtime_nuget_package, $arm64_static_runtime_nupkg_unzipped_directory) - $nupkgs = (Get-ChildItem ..\nuget-artifact-arm -Filter Microsoft.AI.MachineLearning*.nupkg -Recurse) - $arm_nuget_package = $nupkgs[0].FullName - $arm_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName - $arm_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm_nuget_package)) - [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_nuget_package, $arm_nupkg_unzipped_directory) - - $nupkgs = (Get-ChildItem ..\nuget-artifact-arm-static-runtime -Filter Microsoft.AI.MachineLearning*.nupkg -Recurse) - $arm_static_runtime_nuget_package = $nupkgs[0].FullName - $arm_static_runtime_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName - $arm_static_runtime_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_static_runtime_nupkg_unzipped_directory_root, 'binaries', [System.IO.Path]::GetFileNameWithoutExtension($arm_static_runtime_nuget_package)) - [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_static_runtime_nuget_package, $arm_static_runtime_nupkg_unzipped_directory) + $x64_static_runtime_path_old = [System.IO.Path]::Combine($x64_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-x64', '_native') $x64_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-x64', '_native', 'static') @@ -216,10 +183,7 @@ extends: $arm64_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native') $arm64_static_runtime_path_old = [System.IO.Path]::Combine($arm64_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native') $arm64_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native', 'static') - $arm_runtime_path_old = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - $arm_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - $arm_static_runtime_path_old = [System.IO.Path]::Combine($arm_static_runtime_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - $arm_static_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native', 'static') + $uap_build_path_old = [System.IO.Path]::Combine($x64_static_runtime_nupkg_unzipped_directory, 'build', 'native') $uap_build_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'build', 'uap10.0') @@ -228,8 +192,6 @@ extends: New-Item -Path $x86_static_runtime_path_new -ItemType Directory New-Item -Path $arm64_runtime_path_new -ItemType Directory New-Item -Path $arm64_static_runtime_path_new -ItemType Directory - New-Item -Path $arm_runtime_path_new -ItemType Directory - New-Item -Path $arm_static_runtime_path_new -ItemType Directory Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.dll')) $x86_runtime_path_new Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.lib')) $x86_runtime_path_new @@ -241,11 +203,6 @@ extends: Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.dll')) $arm64_runtime_path_new Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.lib')) 
$arm64_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.dll')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.lib')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.dll')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.lib')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'onnxruntime.dll')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'onnxruntime.dll')) Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'onnxruntime.lib')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'onnxruntime.lib')) Copy-Item ([System.IO.Path]::Combine($x64_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($x64_static_runtime_path_new, 'microsoft.ai.machinelearning.dll')) @@ -261,11 +218,6 @@ extends: Copy-Item ([System.IO.Path]::Combine($arm64_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($arm64_static_runtime_path_new, 'microsoft.ai.machinelearning.dll')) Copy-Item ([System.IO.Path]::Combine($arm64_static_runtime_path_old, 'microsoft.ai.machinelearning.lib')) ([System.IO.Path]::Combine($arm64_static_runtime_path_new, 'microsoft.ai.machinelearning.lib')) - Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'onnxruntime.dll')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'onnxruntime.dll')) - Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'onnxruntime.lib')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'onnxruntime.lib')) - Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'microsoft.ai.machinelearning.dll')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'microsoft.ai.machinelearning.dll')) - Copy-Item ([System.IO.Path]::Combine($arm_static_runtime_path_old, 'microsoft.ai.machinelearning.lib')) ([System.IO.Path]::Combine($arm_static_runtime_path_new, 'microsoft.ai.machinelearning.lib')) - Copy-Item -Recurse $uap_build_path_old $uap_build_path_new $merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'merged') @@ -304,22 +256,13 @@ extends: $arm64_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm64_nupkg_unzipped_directory_root, 'symbols', [System.IO.Path]::GetFileNameWithoutExtension($arm64_nuget_package)) [System.IO.Compression.ZipFile]::ExtractToDirectory($arm64_nuget_package, $arm64_nupkg_unzipped_directory) - $nupkgs = (Get-ChildItem ..\nuget-artifact-arm -Filter Microsoft.AI.MachineLearning*.snupkg -Recurse) - $arm_nuget_package = $nupkgs[0].FullName - $arm_nupkg_unzipped_directory_root = $nupkgs[0].Directory.FullName - $arm_nupkg_unzipped_directory = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory_root, 'symbols', [System.IO.Path]::GetFileNameWithoutExtension($arm_nuget_package)) - [System.IO.Compression.ZipFile]::ExtractToDirectory($arm_nuget_package, $arm_nupkg_unzipped_directory) - $x86_runtime_path_old = [System.IO.Path]::Combine($x86_nupkg_unzipped_directory, 'runtimes', 'win-x86', '_native') $x86_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-x86', '_native') $arm64_runtime_path_old = [System.IO.Path]::Combine($arm64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native') $arm64_runtime_path_new = 
[System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm64', '_native') - $arm_runtime_path_old = [System.IO.Path]::Combine($arm_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - $arm_runtime_path_new = [System.IO.Path]::Combine($x64_nupkg_unzipped_directory, 'runtimes', 'win-arm', '_native') - + New-Item -Path $x86_runtime_path_new -ItemType Directory New-Item -Path $arm64_runtime_path_new -ItemType Directory - New-Item -Path $arm_runtime_path_new -ItemType Directory Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'onnxruntime.pdb')) $x86_runtime_path_new Copy-Item ([System.IO.Path]::Combine($x86_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $x86_runtime_path_new @@ -327,9 +270,6 @@ extends: Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'onnxruntime.pdb')) $arm64_runtime_path_new Copy-Item ([System.IO.Path]::Combine($arm64_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $arm64_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'onnxruntime.pdb')) $arm_runtime_path_new - Copy-Item ([System.IO.Path]::Combine($arm_runtime_path_old, 'microsoft.ai.machinelearning.pdb')) $arm_runtime_path_new - $merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'merged') if (!(Test-Path $merged_nuget_path)) { New-Item -Path $merged_nuget_path -ItemType Directory diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index badee79fd78b3..172a0dc1866ab 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -92,6 +92,9 @@ stages: vmImage: ubuntu-latest steps: - checkout: none + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() - bash: | # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. 
set +x @@ -105,6 +108,10 @@ stages: echo "##vso[task.setvariable variable=ReleaseVersionSuffix;isOutput=true]" fi name: Set_Release_Version_Suffix + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' + - stage: Debug dependsOn: Setup @@ -116,7 +123,14 @@ stages: MyVar: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] steps: - checkout: none + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() - bash: echo $(MyVar) + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' + - stage: Download_Java_Tools dependsOn: [] @@ -126,6 +140,9 @@ stages: vmImage: ubuntu-latest steps: - checkout: none + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() - task: CmdLine@2 displayName: Download Java Tools inputs: @@ -141,6 +158,9 @@ stages: inputs: targetPath: '$(Agent.TempDirectory)/java-tools' artifact: 'onnxruntime-java-tools' + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' - template: templates/c-api-cpu.yml parameters: @@ -525,6 +545,9 @@ stages: submodules: false - checkout: manylinux # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/manylinux submodules: false + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() - script: | set -e -x @@ -603,6 +626,10 @@ stages: inputs: targetPath: '$(Build.ArtifactStagingDirectory)/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' artifactName: 'onnxruntime-linux-x64-gpu' + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' + - stage: Windows_Packaging_combined_GPU dependsOn: @@ -619,6 +646,10 @@ stages: - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime - checkout: onnxruntime-inference-examples # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime-inference-examples submodules: false + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + - script: dir $(Build.SourcesDirectory) - task: BatchScript@1 displayName: 'setup env' @@ -688,7 +719,9 @@ stages: inputs: artifactName: 'onnxruntime-win-x64-gpu' targetPath: '$(Build.ArtifactStagingDirectory)' - + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' - stage: NuGet_Packaging_GPU dependsOn: @@ -1246,45 +1279,21 @@ stages: mkdir $(Build.ArtifactStagingDirectory)\testdata copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata -- template: nuget/templates/dml-vs-2022.yml - parameters: - AgentPool : 'onnxruntime-Win-CPU-2022' - IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - ArtifactName: 'drop-win-dml-arm-zip' - StageName: 'Windows_CI_GPU_DML_Dev_arm' - BuildCommand: --build_dir $(Build.BinariesDirectory) --arm --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" - BuildArch: 'x64' - EnvSetupScript: 
'setup_env.bat' - sln_platform: 'arm' - DoDebugBuild: 'false' - DoNugetPack : 'true' - DoCompliance: ${{ parameters.DoCompliance }} - DoEsrp: ${{ parameters.DoEsrp }} - RunTests: 'false' - BuildNodejs: 'false' - NuPackScript: | - msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /p:TargetArchitecture=arm /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML /p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} - cd $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\ - ren Microsoft.ML.OnnxRuntime.DirectML.* win-dml-arm.zip - copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\win-dml-arm.zip $(Build.ArtifactStagingDirectory) - mkdir $(Build.ArtifactStagingDirectory)\testdata - copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata - - stage: NuGet_Packaging_DML dependsOn: - Windows_CI_GPU_DML_Dev - Windows_CI_GPU_DML_Dev_x86 - Windows_CI_GPU_DML_Dev_arm64 - - Windows_CI_GPU_DML_Dev_arm condition: succeeded() jobs: - job: workspace: clean: all - pool: 'onnxruntime-Win2022-GPU-T4' - + pool: 'onnxruntime-Win2022-GPU-dml-A10' steps: - + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() - task: DownloadPipelineArtifact@0 displayName: 'Download Pipeline Artifact - NuGet DirectML' inputs: @@ -1303,12 +1312,6 @@ stages: artifactName: 'drop-win-dml-arm64-zip' targetPath: '$(Build.BinariesDirectory)/nuget-artifact-dml' - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet DirectML arm' - inputs: - artifactName: 'drop-win-dml-arm-zip' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-dml' - - script: | pushd $(Build.BinariesDirectory)\nuget-artifact-dml dir @@ -1339,13 +1342,6 @@ stages: move win-arm64\runtimes\win-arm64\native\onnxruntime.lib %%~ni\runtimes\win-arm64\native\onnxruntime.lib move win-arm64\runtimes\win-arm64\native\onnxruntime.pdb %%~ni\runtimes\win-arm64\native\onnxruntime.pdb - unzip win-dml-arm.zip -d win-arm - mkdir %%~ni\runtimes\win-arm - mkdir %%~ni\runtimes\win-arm\native - - move win-arm\runtimes\win-arm\native\onnxruntime.dll %%~ni\runtimes\win-arm\native\onnxruntime.dll - move win-arm\runtimes\win-arm\native\onnxruntime.lib %%~ni\runtimes\win-arm\native\onnxruntime.lib - move win-arm\runtimes\win-arm\native\onnxruntime.pdb %%~ni\runtimes\win-arm\native\onnxruntime.pdb pushd %%~ni zip -r ..\%%~ni.zip . 
@@ -1368,7 +1364,7 @@ stages: PackageType: 'nuget' PackagePath: '$(Build.ArtifactStagingDirectory)' PackageName: 'Microsoft.ML.OnnxRuntime.DirectML*nupkg' - PlatformsSupported: 'win-x64,win-x86,win-arm64,win-arm' + PlatformsSupported: 'win-x64,win-x86,win-arm64' VerifyNugetSigning: ${{ parameters.DoEsrp }} - task: PublishPipelineArtifact@0 @@ -1376,3 +1372,6 @@ stages: inputs: artifactName: 'drop-signed-nuget-dml' targetPath: '$(Build.ArtifactStagingDirectory)' + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 37b4bdc43afcd..e6025ae1b56bd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -161,20 +161,6 @@ stages: buildJava: false buildNodejs: false -- template: win-ci.yml - parameters: - DoCompliance: ${{ parameters.DoCompliance }} - DoEsrp: ${{ parameters.DoEsrp }} - stage_name_suffix: CPU_arm_${{ parameters.BuildVariant }} - buildArch: x64 - msbuildPlatform: arm - packageName: arm - buildparameter: --arm ${{ parameters.AdditionalBuildFlags }} ${{ parameters.AdditionalWinBuildFlags}} --path_to_protoc_exe $(Build.BinariesDirectory)\RelWithDebInfo\installed\bin\protoc.exe - runTests: false - buildJava: false - buildNodejs: false - ort_build_pool_name: onnxruntime-Win-CPU-2022 - - template: win-ci.yml parameters: DoCompliance: ${{ parameters.DoCompliance }} @@ -205,10 +191,7 @@ stages: dependsOn: - Linux_C_API_Packaging_CPU - MacOS_C_API_Package_Publish - - Windows_Packaging_CPU_x86_${{ parameters.BuildVariant }} - Windows_Packaging_CPU_x64_${{ parameters.BuildVariant }} - - Windows_Packaging_CPU_arm_${{ parameters.BuildVariant }} - - Windows_Packaging_CPU_arm64_${{ parameters.BuildVariant }} - Download_Java_Tools condition: succeeded() jobs: @@ -297,7 +280,6 @@ stages: - MacOS_C_API_Package_Publish - Windows_Packaging_CPU_x86_${{ parameters.BuildVariant }} - Windows_Packaging_CPU_x64_${{ parameters.BuildVariant }} - - Windows_Packaging_CPU_arm_${{ parameters.BuildVariant }} - Windows_Packaging_CPU_arm64_${{ parameters.BuildVariant }} - Android_Java_API_AAR_Packaging_Full - iOS_Full_xcframework @@ -340,14 +322,6 @@ stages: SpecificArtifact: ${{ parameters.specificArtifact }} BuildId: ${{ parameters.BuildId }} - - template: flex-downloadPipelineArtifact.yml - parameters: - StepName: 'Download win-arm Pipeline Artifact' - ArtifactName: 'onnxruntime-win-arm' - TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' - SpecificArtifact: ${{ parameters.specificArtifact }} - BuildId: ${{ parameters.BuildId }} - - template: flex-downloadPipelineArtifact.yml parameters: StepName: 'Download osx-x64 Pipeline Artifact' diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux.yml index db3782c69cf62..2adcbb13dbeb8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux.yml @@ -106,3 +106,7 @@ jobs: inputs: artifactName: 'drop-linux-cpu-${{ parameters.arch }}' targetPath: '$(Build.BinariesDirectory)/${{ parameters.cmake_build_type }}' + + - template: component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' \ No newline at end of file From dee6a5b3715c5bdf7a6d29c2b9516902ebd0e0b1 Mon Sep 17 00:00:00 2001 From: Xu 
Xing Date: Tue, 9 Jan 2024 23:46:30 +0800 Subject: [PATCH 09/29] [js/webgpu] Support uniforms for attention and multihead attention (#18903) --- .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 4 +- js/web/lib/wasm/jsep/webgpu/ops/attention.ts | 330 +++++++++--------- js/web/lib/wasm/jsep/webgpu/ops/common.ts | 4 +- .../jsep/webgpu/ops/multi-head-attentiion.ts | 38 +- 4 files changed, 190 insertions(+), 186 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 06c3c6c196501..c182d3c4eaf6f 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -2,7 +2,7 @@ // Licensed under the MIT License. import {argMax, argMin, parseArgMinMaxAttributes} from './ops/argminmax'; -import {attention, parseAttentionAttributes} from './ops/attention'; +import {attention} from './ops/attention'; import {batchNorm} from './ops/batch-norm'; import {biasAdd} from './ops/bias-add'; import {biasSplitGelu} from './ops/bias-split-gelu'; @@ -50,7 +50,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Asinh', [unaryOps.asinh]], ['Atan', [unaryOps.atan]], ['Atanh', [unaryOps.atanh]], - ['Attention', [attention, parseAttentionAttributes]], + ['Attention', [attention]], // TODO: support new attributes for AveragePool-10 ['AveragePool', [pool.averagePool, pool.parseAveragePoolAttributes]], ['BatchNormalization', [batchNorm]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/attention.ts b/js/web/lib/wasm/jsep/webgpu/ops/attention.ts index e1f2a47301bfb..ef8038dff487e 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/attention.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/attention.ts @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
+import {tensorDataTypeEnumToString} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; -import {createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, GpuDataType} from '../types'; +import {ComputeContext, GpuDataType, ProgramUniform} from '../types'; -import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType} from './common'; +import {castToF32, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, tensorTypeToWsglValueType, UniformDataElementType, UniformsArrayType} from './common'; export const enum AttentionQkvFormat { unknown, // enum value not set, or depends on qkv projection implementation details @@ -231,20 +231,8 @@ const validateAttentionInputs = (inputs: readonly TensorView[], attributes: Atte }; }; -export const parseAttentionAttributes = (attributes: AttentionAttrs): AttentionAttrs => - createAttributeWithCacheKey({...attributes}); - export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView, n: number, d: number) => { const components = getMaxComponents(d); - const inputHelper = outputVariable('x', input.dataType, input.dims, components); - - let threadMaxValue = 'threadMaxVector'; - if (components === 2) { - threadMaxValue = 'max(threadMaxVector.x, threadMaxVector.y)'; - } else if (components === 4) { - threadMaxValue = 'max(max(threadMaxVector.x, threadMaxVector.y), max(threadMaxVector.z, threadMaxVector.w))'; - } - const dataType = tensorTypeToWsglStorageType(input.dataType); let WG = 64; const dComp = d / components; if (dComp < WG) { @@ -253,25 +241,41 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView WG = Math.ceil(dComp / 8); } const elementsPerWG = Math.ceil(d / components / WG); + const tensorDataType = tensorDataTypeEnumToString(input.dataType) as ProgramUniform['type']; + const programUniforms: ProgramUniform[] = + [{type: tensorDataType, data: 1 / d}, {type: 'uint32', data: dComp}, {type: 'uint32', data: elementsPerWG}]; + const dataType = tensorTypeToWsglStorageType(input.dataType, components); + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const inputHelper = outputVariable('x', input.dataType, input.dims, components); + let threadMaxValue = 'thread_max_vector'; + if (components === 2) { + threadMaxValue = 'max(thread_max_vector.x, thread_max_vector.y)'; + } else if (components === 4) { + threadMaxValue = + 'max(max(thread_max_vector.x, thread_max_vector.y), max(thread_max_vector.z, thread_max_vector.w))'; + } + const elemValueType = tensorTypeToWsglValueType(input.dataType); + const uniforms: UniformsArrayType = [ + {name: 'd_inv', type: elemValueType as UniformDataElementType}, {name: 'd_comp', type: 'u32'}, + {name: 'elements_per_wg', type: 'u32'} + ]; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const dInv: ${dataType} = 1 / ${d}; - const dComp = ${d / components}; + return ` var wgMax: array; var wgSum: array; - - ${shaderHelper.declareVariables(inputHelper)} - @compute @workgroup_size(${WG}, 1, 1) - fn main(@builtin(workgroup_id) workgroup_id : vec3, - @builtin(local_invocation_index) local_index : u32) { - let localOffset = local_index * ${elementsPerWG}; - let offset: u32 = workgroup_id.x * dComp + localOffset; - - var threadMaxVector = ${fillVector('f32', components, '-3.402823e+38f')}; - for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) { - 
threadMaxVector = max(${castToF32(dataType, components, 'x[offset + i]')}, threadMaxVector); + ${shaderHelper.registerUniforms(uniforms).declareVariables(inputHelper)} + ${shaderHelper.mainStart([ + WG, 1, 1 + ])} + let localOffset = local_idx * uniforms.elements_per_wg; + let offset: u32 = workgroup_id.x * uniforms.d_comp + localOffset; + + var thread_max_vector = ${fillVector('f32', components, '-3.402823e+38f')}; + for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) { + thread_max_vector = max(${castToF32(elemValueType, components, 'x[offset + i]')}, thread_max_vector); } - wgMax[local_index] = ${threadMaxValue}; + wgMax[local_idx] = ${threadMaxValue}; workgroupBarrier(); var maxValue = -3.402823e+38f; @@ -280,10 +284,10 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView } var sumVector = ${fillVector('f32', components, '0')}; - for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) { - sumVector += exp(${castToF32(dataType, components, 'x[offset + i]')} - maxValue); + for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) { + sumVector += exp(${castToF32(elemValueType, components, 'x[offset + i]')} - maxValue); } - wgSum[local_index] = ${sumVector('sumVector', components)}; + wgSum[local_idx] = ${sumVector('sumVector', components)}; workgroupBarrier(); var sum: f32 = 0; @@ -292,26 +296,24 @@ export const computeInPlaceSoftmax = (context: ComputeContext, input: TensorView } if (sum == 0) { - for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) { - x[offset + i] = ${fillVector(dataType, components, 'dInv')}; + for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) { + x[offset + i] = ${fillVector('f32', components, 'uniforms.d_inv')}; } } else { - for (var i: u32 = 0; i < ${elementsPerWG} && i + localOffset < dComp; i++) { - let f32input = ${castToF32(dataType, components, 'x[offset + i]')}; + for (var i: u32 = 0; i < uniforms.elements_per_wg && i + localOffset < uniforms.d_comp; i++) { + let f32input = ${castToF32(elemValueType, components, 'x[offset + i]')}; x[offset + i] = ${inputHelper.type.value}(exp(f32input - maxValue) / sum); } } }`; + }; context.compute( { name: 'AttentionProbsSoftmax', - shaderCache: {hint: `${d}`}, + shaderCache: {hint: `${WG};${dataType};${components}`}, getShaderSource, - getRunData: () => ({ - outputs: [], - dispatchGroup: {x: n}, - }), + getRunData: () => ({outputs: [], dispatchGroup: {x: n}, programUniforms}), }, {inputs: [input], outputs: []}); }; @@ -326,47 +328,43 @@ const computeAttentionProbs = // TODO: handle mask const alpha = attributes.scale === 0 ? 
1.0 / Math.sqrt(parameters.headSize) : attributes.scale; - - const dataType = tensorTypeToWsglStorageType(q.dataType); - const components = getMaxComponents(parameters.headSize); - const qInput = inputVariable('q', q.dataType, q.dims, components); - const kInput = inputVariable('key', key.dataType, key.dims, components); - const output = outputVariable('output', q.dataType, probsShape); - const vectorizedHeadSize = parameters.headSize / components; - const M = parameters.sequenceLength; - const N = parameters.totalSequenceLength; - const K = vectorizedHeadSize; - const TILE_SIZE = 12; - const dispatch = { x: Math.ceil(parameters.totalSequenceLength / TILE_SIZE), y: Math.ceil(parameters.sequenceLength / TILE_SIZE), z: parameters.batchSize * parameters.numHeads }; + const tensorDataType = tensorDataTypeEnumToString(q.dataType) as ProgramUniform['type']; + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: parameters.sequenceLength}, {type: 'uint32', data: vectorizedHeadSize}, + {type: 'uint32', data: parameters.totalSequenceLength}, {type: 'uint32', data: parameters.kvSequenceLength}, + {type: tensorDataType, data: alpha} + ]; const inputs = [q, key]; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const M: u32 = ${M}u; - const N: u32 = ${N}u; - const K: u32 = ${K}u; - const alpha: ${dataType} = ${alpha}; + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const qInput = inputVariable('q', q.dataType, q.dims, components); + const kInput = inputVariable('key', key.dataType, key.dims, components); + const output = outputVariable('output', q.dataType, probsShape); + const dataType = tensorTypeToWsglStorageType(q.dataType); + + const uniforms: UniformsArrayType = [ + {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'}, + {name: 'kv_sequence_length', type: 'u32'}, {name: 'alpha', type: dataType as UniformDataElementType} + ]; + return ` const beta: ${dataType} = 1.0; const TILE_SIZE = ${TILE_SIZE}u; var tileQ: array<${qInput.type.storage}, ${TILE_SIZE * TILE_SIZE}>; var tileK: array<${qInput.type.storage}, ${TILE_SIZE * TILE_SIZE}>; - - ${shaderHelper.declareVariables(qInput, kInput, output)} - - @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1) - fn main(@builtin(workgroup_id) workgroup_id : vec3, - @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) { - let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u + - workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index; - + ${shaderHelper.registerUniforms(uniforms).declareVariables(qInput, kInput, output)} + ${shaderHelper.mainStart([ + TILE_SIZE, TILE_SIZE, 1 + ])} // x holds the N and y holds the M let headIdx = workgroup_id.z; let m = workgroup_id.y * TILE_SIZE; @@ -374,40 +372,42 @@ const computeAttentionProbs = let lm = m + local_id.y; let ln = n + local_id.x; - let qOffset = ${parameters.sequenceLength * vectorizedHeadSize} * headIdx + m * K; - let kOffset = ${parameters.kvSequenceLength * vectorizedHeadSize} * headIdx + n * K; + let qOffset = uniforms.M * uniforms.K * headIdx + m * uniforms.K; + let kOffset = uniforms.kv_sequence_length * uniforms.K * headIdx + n * uniforms.K; var value = ${fillVector(dataType, components)}; - for (var w: u32 = 0u; w < K; w += TILE_SIZE) { - if (m + local_id.y < M && w + local_id.x < K) { - tileQ[TILE_SIZE * local_id.y + local_id.x] = q[qOffset + local_id.y * K + w + local_id.x]; + for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) { + if (m 
+ local_id.y < uniforms.M && w + local_id.x < uniforms.K) { + tileQ[TILE_SIZE * local_id.y + local_id.x] = q[qOffset + local_id.y * uniforms.K + w + local_id.x]; } - if (n + local_id.y < N && w + local_id.x < K) { - tileK[TILE_SIZE * local_id.y + local_id.x] = key[kOffset + local_id.y * K + w + local_id.x]; + if (n + local_id.y < uniforms.N && w + local_id.x < uniforms.K) { + tileK[TILE_SIZE * local_id.y + local_id.x] = key[kOffset + local_id.y * uniforms.K + w + local_id.x]; } workgroupBarrier(); - for (var k: u32 = 0u; k ({ outputs: [{dims: probsShape, dataType: q.dataType, gpuDataType: GpuDataType.default}], dispatchGroup: dispatch, + programUniforms }), getShaderSource, }, @@ -423,78 +423,76 @@ const computeAttentionProbs = const computeVxAttentionScore = (context: ComputeContext, probs: TensorView, v: TensorView, params: AttentionParameters) => { const outputShape = [params.batchSize, params.sequenceLength, params.vHiddenSize]; - - const probsHelper = inputVariable('probs', probs.dataType, probs.dims); - const vHelper = inputVariable('v', v.dataType, v.dims); - const output = outputVariable('output', probs.dataType, outputShape); - - const dataType = tensorTypeToWsglStorageType(probs.dataType); - const TILE_SIZE = 12; const dispatch = { x: Math.ceil(params.vHeadSize / TILE_SIZE), y: Math.ceil(params.sequenceLength / TILE_SIZE), z: params.batchSize * params.numHeads }; + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: params.sequenceLength}, {type: 'uint32', data: params.totalSequenceLength}, + {type: 'uint32', data: params.vHeadSize}, {type: 'uint32', data: params.numHeads}, + {type: 'uint32', data: params.vHiddenSize} + ]; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const M: u32 = ${params.sequenceLength}u; - const N: u32 = ${params.vHeadSize}u; - const K: u32 = ${params.totalSequenceLength}u; - const numHeads: u32 = ${params.numHeads}u; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const probsHelper = inputVariable('probs', probs.dataType, probs.dims); + const vHelper = inputVariable('v', v.dataType, v.dims); + const output = outputVariable('output', probs.dataType, outputShape); + const uniforms: UniformsArrayType = [ + {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'}, + {name: 'num_heads', type: 'u32'}, {name: 'v_hidden_size', type: 'u32'} + ]; + return ` const TILE_SIZE = ${TILE_SIZE}u; - - var tileQ: array<${probsHelper.type.storage}, ${TILE_SIZE * TILE_SIZE}>; - var tileK: array<${probsHelper.type.storage}, ${TILE_SIZE * TILE_SIZE}>; - - ${shaderHelper.declareVariables(probsHelper, vHelper, output)} - - @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1) - fn main(@builtin(workgroup_id) workgroup_id : vec3, - @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) { - let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u + - workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index; - + var tileQ: array<${probsHelper.type.value}, ${TILE_SIZE * TILE_SIZE}>; + var tileK: array<${probsHelper.type.value}, ${TILE_SIZE * TILE_SIZE}>; + ${shaderHelper.registerUniforms(uniforms).declareVariables(probsHelper, vHelper, output)} + ${shaderHelper.mainStart([ + TILE_SIZE, TILE_SIZE, 1 + ])} let headIdx = workgroup_id.z; let m = workgroup_id.y * TILE_SIZE + local_id.y; let n = workgroup_id.x * TILE_SIZE + local_id.x; - let offsetA = headIdx * (M * K) + m * K; - let offsetB = headIdx * (N * K) + n; + let offsetA = headIdx 
* (uniforms.M * uniforms.K) + m * uniforms.K; + let offsetB = headIdx * (uniforms.N * uniforms.K) + n; - var value = ${dataType}(0); - for (var w: u32 = 0u; w < K; w += TILE_SIZE) { - if (m < M && w + local_id.x < K) { + var value = ${probsHelper.type.storage}(0); + for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) { + if (m < uniforms.M && w + local_id.x < uniforms.K) { tileQ[TILE_SIZE * local_id.y + local_id.x] = probs[offsetA + w + local_id.x]; } - if (n < N && w + local_id.y < K) { - tileK[TILE_SIZE * local_id.y + local_id.x] = v[offsetB + (w + local_id.y) * N]; + if (n < uniforms.N && w + local_id.y < uniforms.K) { + tileK[TILE_SIZE * local_id.y + local_id.x] = v[offsetB + (w + local_id.y) * uniforms.N]; } workgroupBarrier(); - for (var k: u32 = 0u; k ({ outputs: [{dims: outputShape, dataType: probs.dataType, gpuDataType: GpuDataType.default}], dispatchGroup: dispatch, + programUniforms }), getShaderSource, }, @@ -517,71 +515,71 @@ const prepare = (context: ComputeContext, parameters: AttentionParameters) => { parameters.sequenceLength, parameters.headSize, ]; - - const dataType = tensorTypeToWsglStorageType(context.inputs[0].dataType); - const M = parameters.sequenceLength; const K = parameters.inputHiddenSize; const N = parameters.headSize; - const TILE_SIZE = 12; const dispatch = { x: Math.ceil(parameters.headSize / TILE_SIZE), y: Math.ceil(parameters.sequenceLength / TILE_SIZE), z: parameters.batchSize * parameters.numHeads }; + const inputs = [context.inputs[0], context.inputs[1], context.inputs[2]]; + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: M}, {type: 'uint32', data: K}, {type: 'uint32', data: N}, + {type: 'uint32', data: parameters.numHeads}, {type: 'uint32', data: parameters.headSize}, + {type: 'uint32', data: parameters.hiddenSize}, + {type: 'uint32', data: parameters.hiddenSize + parameters.hiddenSize + parameters.vHiddenSize} + ]; - const getShaderSource = () => ` - const M: u32 = ${M}u; - const K: u32 = ${K}u; - const N: u32 = ${N}u; - const numHeads: u32 = ${parameters.numHeads}; - const ldb = ${parameters.hiddenSize + parameters.hiddenSize + parameters.vHiddenSize}u; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const outputQ = outputVariable('output_q', inputs[0].dataType, outputShape); + const outputK = outputVariable('output_k', inputs[0].dataType, outputShape); + const outputV = outputVariable('output_v', inputs[0].dataType, outputShape); + const input = inputVariable('input', inputs[0].dataType, inputs[0].dims); + const weight = inputVariable('weight', inputs[1].dataType, inputs[1].dims); + const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims); + const dataType = input.type.storage; + + const uniforms: UniformsArrayType = [ + {name: 'M', type: 'u32'}, {name: 'K', type: 'u32'}, {name: 'N', type: 'u32'}, {name: 'num_heads', type: 'u32'}, + {name: 'head_size', type: 'u32'}, {name: 'hidden_size', type: 'u32'}, {name: 'ldb', type: 'u32'} + ]; + return ` const TILE_SIZE = ${TILE_SIZE}u; - var tileInput: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>; var tileWeightQ: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>; var tileWeightK: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>; var tileWeightV: array<${dataType}, ${TILE_SIZE * TILE_SIZE}>; - - @group(0) @binding(0) var input: array<${dataType}>; - @group(0) @binding(1) var weight: array<${dataType}>; - @group(0) @binding(2) var bias: array<${dataType}>; - @group(0) @binding(3) var outputQ: array<${dataType}>; - @group(0) @binding(4) var outputK: 
array<${dataType}>; - @group(0) @binding(5) var outputV: array<${dataType}>; - - @compute @workgroup_size(${TILE_SIZE}, ${TILE_SIZE}, 1) - fn main(@builtin(workgroup_id) workgroup_id : vec3, - @builtin(local_invocation_id) local_id : vec3, @builtin(local_invocation_index) local_index : u32) { - let global_idx = (workgroup_id.z * ${dispatch.x * dispatch.y}u + - workgroup_id.y * ${dispatch.x}u + workgroup_id.x) * ${TILE_SIZE * TILE_SIZE}u + local_index; - - let batchIndex = workgroup_id.z / ${parameters.numHeads}; - let headNumber = workgroup_id.z % ${parameters.numHeads}; + ${shaderHelper.registerUniforms(uniforms).declareVariables(input, weight, bias, outputQ, outputK, outputV)} + ${shaderHelper.mainStart([ + TILE_SIZE, TILE_SIZE, 1 + ])} + let batchIndex = workgroup_id.z / uniforms.num_heads; + let headNumber = workgroup_id.z % uniforms.num_heads; let m = workgroup_id.y * TILE_SIZE + local_id.y; let n = workgroup_id.x * TILE_SIZE + local_id.x; - let inputOffset = batchIndex * (M * K) + m * K; - let biasOffsetQ = headNumber * ${parameters.headSize}; - let biasOffsetK = ${parameters.hiddenSize} + biasOffsetQ; - let biasOffsetV = ${parameters.hiddenSize} + biasOffsetK; + let inputOffset = batchIndex * (uniforms.M * uniforms.K) + m * uniforms.K; + let biasOffsetQ = headNumber * uniforms.head_size; + let biasOffsetK = uniforms.hidden_size + biasOffsetQ; + let biasOffsetV = uniforms.hidden_size + biasOffsetK; var valueQ = ${dataType}(0); var valueK = ${dataType}(0); var valueV = ${dataType}(0); - for (var w: u32 = 0u; w < K; w += TILE_SIZE) { - if (m < M && w + local_id.x < K) { + for (var w: u32 = 0u; w < uniforms.K; w += TILE_SIZE) { + if (m < uniforms.M && w + local_id.x < uniforms.K) { tileInput[TILE_SIZE * local_id.y + local_id.x] = input[inputOffset + w + local_id.x]; } - if (n < N && w + local_id.y < K) { - let offset = n + (w + local_id.y) * ldb; + if (n < uniforms.N && w + local_id.y < uniforms.K) { + let offset = n + (w + local_id.y) * uniforms.ldb; tileWeightQ[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetQ + offset]; tileWeightK[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetK + offset]; tileWeightV[TILE_SIZE * local_id.y + local_id.x] = weight[biasOffsetV + offset]; } workgroupBarrier(); - for (var k: u32 = 0u; k { workgroupBarrier(); } - let headOffset = (m * N + n) % ${parameters.headSize}; + let headOffset = (m * uniforms.N + n) % uniforms.head_size; valueQ += bias[headOffset + biasOffsetQ]; valueK += bias[headOffset + biasOffsetK]; valueV += bias[headOffset + biasOffsetV]; - let offset = workgroup_id.z * M * N; - if (m < M && n < N) { - let outputIdx = offset + m * N + n; - outputQ[outputIdx] = valueQ; - outputK[outputIdx] = valueK; - outputV[outputIdx] = valueV; + let offset = workgroup_id.z * uniforms.M * uniforms.N; + if (m < uniforms.M && n < uniforms.N) { + let outputIdx = offset + m * uniforms.N + n; + output_q[outputIdx] = valueQ; + output_k[outputIdx] = valueK; + output_v[outputIdx] = valueV; } }`; - - const inputs = [context.inputs[0], context.inputs[1], context.inputs[2]]; + }; return context.compute( { name: 'AttentionPrepare', - shaderCache: {hint: JSON.stringify(parameters)}, + shaderCache: {inputDependencies: ['type', 'type', 'type']}, getRunData: () => ({ outputs: [ {dims: outputShape, dataType: context.inputs[0].dataType, gpuDataType: GpuDataType.default}, @@ -619,6 +616,7 @@ const prepare = (context: ComputeContext, parameters: AttentionParameters) => { {dims: outputShape, dataType: context.inputs[0].dataType, gpuDataType: 
GpuDataType.default}, ], dispatchGroup: dispatch, + programUniforms }), getShaderSource, }, diff --git a/js/web/lib/wasm/jsep/webgpu/ops/common.ts b/js/web/lib/wasm/jsep/webgpu/ops/common.ts index 3ce114c5d3884..bc3265be955f0 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/common.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/common.ts @@ -780,8 +780,10 @@ class ShaderHelperImpl implements ShaderHelper { const is1DimensionDispatch = this.normalizedDispatchGroup[1] === 1 && this.normalizedDispatchGroup[2] === 1; const paramList = is1DimensionDispatch ? `@builtin(global_invocation_id) global_id : vec3, + @builtin(workgroup_id) workgroup_id : vec3, @builtin(local_invocation_id) local_id : vec3` : - `@builtin(local_invocation_index) local_idx : u32, + `@builtin(local_invocation_id) local_id : vec3, + @builtin(local_invocation_index) local_idx : u32, @builtin(workgroup_id) workgroup_id : vec3, @builtin(num_workgroups) num_workgroups : vec3`; const globalIdxDefinition = is1DimensionDispatch ? diff --git a/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts b/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts index b7726a36bcaad..6d22e3780efd9 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/multi-head-attentiion.ts @@ -4,10 +4,10 @@ import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; import {createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, GpuDataType} from '../types'; +import {ComputeContext, GpuDataType, ProgramUniform} from '../types'; import {applyAttention, AttentionAttrs, AttentionMaskType, AttentionParameters, AttentionQkvFormat} from './attention'; -import {ShaderHelper, tensorTypeToWsglStorageType} from './common'; +import {inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common'; import {createTransposeProgramInfo, TransposeAttributes} from './transpose'; const validateInputs = (inputs: readonly TensorView[], attributes: AttentionAttrs): AttentionParameters => { @@ -228,7 +228,6 @@ const validateInputs = (inputs: readonly TensorView[], attributes: AttentionAttr }; }; - export const parseMultiHeadAttentionAttributes = (attributes: AttentionAttrs): AttentionAttrs => createAttributeWithCacheKey({...attributes}); @@ -239,30 +238,35 @@ const addBiasTranspose = hiddenSize: number, biasOffset: number) => { const outputShape = [batchSize, sequenceLength, hiddenSize]; const outputSize = ShapeUtil.size(outputShape); - - const dataType = tensorTypeToWsglStorageType(qkv.dataType); - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const biasOffset = ${biasOffset}u; - const hiddenSize = ${hiddenSize}u; - - @group(0) @binding(0) var qkv: array<${dataType}>; - @group(0) @binding(1) var bias: array<${dataType}>; - @group(0) @binding(2) var qkv_with_bias: array<${dataType}>; - + const programUniforms: ProgramUniform[] = + [{type: 'uint32', data: outputSize}, {type: 'uint32', data: biasOffset}, {type: 'uint32', data: hiddenSize}]; + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const output = outputVariable('qkv_with_bias', qkv.dataType, outputShape); + const qkvInput = inputVariable('qkv', qkv.dataType, outputShape); + const biasInput = inputVariable('bias', bias.dataType, outputShape); + + const uniforms: UniformsArrayType = [ + {name: 'output_size', type: 'u32'}, {name: 'bias_offset', type: 'u32'}, {name: 'hidden_size', type: 'u32'} + ]; + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(qkvInput, 
biasInput, output)} ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} - let biasOffsetIdx = (global_idx % hiddenSize) + biasOffset; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + let bias_offset_idx = (global_idx % uniforms.hidden_size) + uniforms.bias_offset; - qkv_with_bias[global_idx] = qkv[global_idx] + bias[biasOffsetIdx]; + qkv_with_bias[global_idx] = qkv[global_idx] + bias[bias_offset_idx]; }`; + }; return context.compute( { name: 'MultiHeadAttentionAddBias', - shaderCache: {hint: JSON.stringify({batchSize, sequenceLength, hiddenSize, biasOffset})}, + shaderCache: {inputDependencies: ['type', 'type']}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: qkv.dataType, gpuDataType: GpuDataType.default}], dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms }), getShaderSource, }, From c1367ae5537162e8030a584d5d3e340d58a1bfe9 Mon Sep 17 00:00:00 2001 From: junchao-loongson <68935141+junchao-loongson@users.noreply.github.com> Date: Wed, 10 Jan 2024 01:20:45 +0800 Subject: [PATCH 10/29] Sqnbitgemm: add loongarch64 code path (#18775) ### Description Add support code for loongarch64 platform in sqnbitgemm ``` 100% tests passed, 0 tests failed out of 7 Total Test time (real) = 116.99 sec 2023-12-11 10:43:21,287 build [INFO] - Build complete ``` --- onnxruntime/core/mlas/lib/sqnbitgemm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm.h b/onnxruntime/core/mlas/lib/sqnbitgemm.h index f8f7dcd43699f..90fdd710e2773 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm.h @@ -232,7 +232,7 @@ MlasSQNBitGemmOperation( size_t RowsRemaining = RangeCountM; while (RowsRemaining > 0) { -#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) +#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_POWER) || defined(MLAS_TARGET_LARCH64) auto RowsHandled = GetMlasPlatform().GemmFloatKernel( a_row, dequant_b, c_blk, K, RowsRemaining, CountN, lda, ldc, 1.f, true ); From eb92681bfbb95e93e34f301605529d605f7f9e63 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Wed, 10 Jan 2024 01:33:57 +0800 Subject: [PATCH 11/29] [js/webgpu] Support range uniforms (#19055) --- js/web/lib/wasm/jsep/webgpu/ops/range.ts | 41 ++++++++++++++++-------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/range.ts b/js/web/lib/wasm/jsep/webgpu/ops/range.ts index 9cf66111bf707..ed04b0f94bc57 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/range.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/range.ts @@ -3,10 +3,10 @@ import {env} from 'onnxruntime-common'; -import {DataType} from '../../../wasm-common'; -import {ComputeContext, ProgramInfo} from '../types'; +import {DataType, tensorDataTypeEnumToString} from '../../../wasm-common'; +import {ComputeContext, ProgramInfo, ProgramUniform} from '../types'; -import {outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, outputVariable, ShaderHelper, UniformDataElementType, UniformsArrayType} from './common'; const validateInputsContent = (start: number, limit: number, delta: number): void => { const sameStartLimit = start === limit; @@ -22,23 +22,36 @@ const createRangeProgramInfo = (start: number, limit: number, delta: number, dat const numElements = Math.abs(Math.ceil((limit - start) / delta)); const outputShape: number[] = [numElements]; const outputSize = numElements; + const tensorDataType = 
tensorDataTypeEnumToString(dataType) as ProgramUniform['type'];
+  const programUniforms: ProgramUniform[] = [
+    {type: 'uint32', data: outputSize}, {type: tensorDataType, data: start}, {type: tensorDataType, data: delta},
+    ...createTensorShapeVariables(outputShape)
+  ];
 
-  const output = outputVariable('output', dataType, outputShape);
-  const wgslType = output.type.storage;
-
-  const getShaderSource = (shaderHelper: ShaderHelper) => `
-  ${shaderHelper.declareVariables(output)}
+  const getShaderSource = (shaderHelper: ShaderHelper) => {
+    const output = outputVariable('output', dataType, outputShape.length);
+    const wgslType = output.type.value;
+    const uniforms: UniformsArrayType = [
+      {name: 'outputSize', type: 'u32'}, {name: 'start', type: wgslType as UniformDataElementType},
+      {name: 'delta', type: wgslType as UniformDataElementType}
+    ];
+    return `
+        ${shaderHelper.registerUniforms(uniforms).declareVariables(output)}
         ${shaderHelper.mainStart()}
-        ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
-        output[global_idx] = ${wgslType}(${start}) + ${wgslType}(global_idx) * ${wgslType}(${delta});
+        ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.outputSize')}
+        output[global_idx] = uniforms.start + ${wgslType}(global_idx) * uniforms.delta;
       }`;
+  };
+
   return {
     name: 'Range',
-    shaderCache: {hint: [start, limit, delta].map(x => x.toString()).join('_')},
+    shaderCache: {hint: `${dataType}`},
     getShaderSource,
-    getRunData: () => (
-        {outputs: [{dims: outputShape, dataType}],
-         dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}})
+    getRunData: () => ({
+      outputs: [{dims: outputShape, dataType}],
+      dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)},
+      programUniforms
+    })
   };
 };

From 42ba2aed54458a2b645d859efb830a3c431ec896 Mon Sep 17 00:00:00 2001
From: Xu Xing
Date: Wed, 10 Jan 2024 01:34:56 +0800
Subject: [PATCH 12/29] [js/webgpu] Support pad uniforms (#19057)

### Description
Move the Pad operator's pads, constant value, and tensor shapes into program
uniforms instead of baking them into the generated WGSL, and key the shader
cache on the pad mode and input rank only.

### Motivation and Context
Same uniforms migration as the other WebGPU operators in this series: cached
programs can be reused when only attribute values or shapes change, avoiding
shader recompilation.

---
 .../lib/wasm/jsep/webgpu/op-resolve-rules.ts  |   4 +-
 js/web/lib/wasm/jsep/webgpu/ops/pad.ts        | 222 +++++++++---------
 2 files changed, 107 insertions(+), 119 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
index c182d3c4eaf6f..ed174bc0982eb 100644
--- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
+++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -20,7 +20,7 @@ import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm';
 import {layerNorm} from './ops/layer-norm';
 import {matMul} from './ops/matmul';
 import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multi-head-attentiion';
-import {pad, parsePadAttributes} from './ops/pad';
+import {pad} from './ops/pad';
 import * as pool from './ops/pool';
 import {range} from './ops/range';
 import {reduceL1, reduceL2, reduceLogSum, reduceLogSumExp, reduceMax, reduceMean, reduceMin, reduceProd, reduceSum, reduceSumSquare} from './ops/reduce';
@@ -95,7 +95,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new
   ['MultiHeadAttention', [multiHeadAttention, parseMultiHeadAttentionAttributes]],
   ['Neg', [unaryOps.neg]],
   ['Not', [unaryOps.not]],
-  ['Pad', [pad, parsePadAttributes]],
+  ['Pad', [pad]],
   ['Pow', [binaryOps.pow]],
   ['Range', [range]],
   ['Reciprocal', [unaryOps.reciprocal]],
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts
index 18859e253aa02..eca3fa7d944bb 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/pad.ts
+++ 
b/js/web/lib/wasm/jsep/webgpu/ops/pad.ts @@ -1,15 +1,14 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {DataType} from '../../../wasm-common'; +import {DataType, tensorDataTypeEnumToString} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {IndicesHelper, inputVariable, outputVariable, ShaderHelper} from './common'; +import {createTensorShapeVariables, getElementAt, IndicesHelper, inputVariable, outputVariable, ShaderHelper, UniformDataElementType, UniformsArrayType} from './common'; -export interface PadAttributes extends AttributeWithCacheKey { +interface PadAttributes { // 0-constant, 1-reflect, 2-edge, 3-wrap readonly mode: number; readonly value: number; @@ -35,27 +34,23 @@ const validateInputs = (inputs: readonly TensorView[]): void => { } }; -const getPadConstant = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[], - dataType: string, constantValue: number): string => { - const inputRank = inputDims.length; - - let block = ''; - for (let i = inputRank - 1; i >= 0; --i) { - block += ` - k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; +const getPadConstant = (output: IndicesHelper, inputRank: number, padsLength: number): string => { + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)}; if (k < 0) { break; } - if (k >= ${inputDims[i]}) { + if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) { break; } - offset += k * ${inputStrides[i]}; + offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)}); `; - } + } - return ` - value = ${dataType}(${constantValue}); + return ` + value = ${output.type.value}(uniforms.constant_value); for (var i = 0; i < 1; i++) { var offset = 0; var k = 0; @@ -63,143 +58,143 @@ const getPadConstant = value = x[offset]; } `; - }; - -const getPadReflect = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => { - const inputRank = inputDims.length; +}; - let block = ''; - for (let i = inputRank - 1; i >= 0; --i) { - block += ` - k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; +const getPadReflect = (output: IndicesHelper, inputRank: number, padsLength: number): string => { + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)}; if (k < 0) { k = -k; } { - let _2n_1 = ${2 * (inputDims[i] - 1)}; + let _2n_1 = 2 * (i32(${getElementAt('uniforms.x_shape', i, inputRank)}) - 1); k = k % _2n_1; - if(k >= ${inputDims[i]}) { + if(k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) { k = _2n_1 - k; } } - offset += k * ${inputStrides[i]}; + offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)}); `; - } + } - return ` + return ` var offset = 0; var k = 0; ${block} value = x[offset]; `; - }; - -const getPadEdge = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => { - const inputRank = inputDims.length; +}; - let block = 
''; - for (let i = inputRank - 1; i >= 0; --i) { - block += ` - k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; +const getPadEdge = (output: IndicesHelper, inputRank: number, padsLength: number): string => { + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)}; if (k < 0) { k = 0; } - if (k >= ${inputDims[i]}) { - k = ${inputDims[i] - 1}; + if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) { + k = i32(${getElementAt('uniforms.x_shape', i, inputRank)}) - 1; } - offset += k * ${inputStrides[i]}; + offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)}); `; - } + } - return ` + return ` var offset = 0; var k = 0; ${block} value = x[offset]; `; - }; - -const getPadWrap = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], pads: number[]): string => { - const inputRank = inputDims.length; +}; - let block = ''; - for (let i = inputRank - 1; i >= 0; --i) { - block += ` - k = i32(${output.indicesGet('indices', i)}) - ${pads[i]}; +const getPadWrap = (output: IndicesHelper, inputRank: number, padsLength: number): string => { + let block = ''; + for (let i = inputRank - 1; i >= 0; --i) { + block += ` + k = i32(${output.indicesGet('indices', i)}) - ${getElementAt('uniforms.pads', i, padsLength)}; if (k < 0) { - k += ${inputDims[i]}; + k += i32(${getElementAt('uniforms.x_shape', i, inputRank)}]); } - if (k >= ${inputDims[i]}) { - k -= ${inputDims[i]}; + if (k >= i32(${getElementAt('uniforms.x_shape', i, inputRank)})) { + k -= i32(${getElementAt('uniforms.x_shape', i, inputRank)}); } - offset += k * ${inputStrides[i]}; + offset += k * i32(${getElementAt('uniforms.x_strides', i, inputRank)}); `; - } + } - return ` + return ` var offset = 0; var k = 0; ${block} value = x[offset]; `; - }; - -const getPadSnippet = - (output: IndicesHelper, inputDims: readonly number[], inputStrides: readonly number[], attributes: PadAttributes, - dataType: string): string => { - switch (attributes.mode) { - case 0: - return getPadConstant(output, inputDims, inputStrides, attributes.pads, dataType, attributes.value); - case 1: - return getPadReflect(output, inputDims, inputStrides, attributes.pads); - case 2: - return getPadEdge(output, inputDims, inputStrides, attributes.pads); - case 3: - return getPadWrap(output, inputDims, inputStrides, attributes.pads); - default: - throw new Error('Invalid mode'); - } - }; - -const generatePadCode = - (shaderHelper: ShaderHelper, inputs: readonly TensorView[], attributes: PadAttributes, dataType: string): - string => { - const inputDims = inputs[0].dims; - const outputDims = ShapeUtil.padShape(inputDims.slice(), attributes.pads); - const outputSize = ShapeUtil.size(outputDims); - const inputStrides = ShapeUtil.computeStrides(inputDims); - - const output = outputVariable('output', inputs[0].dataType, outputDims); - const input = inputVariable('x', inputs[0].dataType, inputDims); - - const padSnippet = getPadSnippet(output, inputDims, inputStrides, attributes, dataType); - const padCode = ` - ${shaderHelper.declareVariables(input, output)} - ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)} - - let indices = ${output.offsetToIndices('global_idx')}; - - var value = ${dataType}(0); - ${padSnippet} - output[global_idx] = value; - }`; - return padCode; - }; +}; + +const getPadSnippet = (output: IndicesHelper, inputRank: number, attributes: 
PadAttributes): string => { + switch (attributes.mode) { + case 0: + return getPadConstant(output, inputRank, attributes.pads.length); + case 1: + return getPadReflect(output, inputRank, attributes.pads.length); + case 2: + return getPadEdge(output, inputRank, attributes.pads.length); + case 3: + return getPadWrap(output, inputRank, attributes.pads.length); + default: + throw new Error('Invalid mode'); + } +}; const createPadProgramInfo = (inputs: readonly TensorView[], attributes: PadAttributes): ProgramInfo => { const outputShape = ShapeUtil.padShape(inputs[0].dims.slice(), attributes.pads); + const inputDims = inputs[0].dims; + const outputSize = ShapeUtil.size(outputShape); + const programUniforms: ProgramUniform[] = + [{type: 'uint32', data: outputSize}, {type: 'uint32', data: attributes.pads}]; + if (attributes.mode === 0) { + const tensorDataType = tensorDataTypeEnumToString(inputs[0].dataType) as ProgramUniform['type']; + programUniforms.push({type: tensorDataType, data: attributes.value}); + } + + programUniforms.push(...createTensorShapeVariables(inputs[0].dims), ...createTensorShapeVariables(outputShape)); + const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank']; + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const output = outputVariable('output', inputs[0].dataType, outputShape.length); + const input = inputVariable('x', inputs[0].dataType, inputDims.length); + const dataType = input.type.value; + const padSnippet = getPadSnippet(output, inputDims.length, attributes); + const uniforms: UniformsArrayType = + [{name: 'output_size', type: 'u32'}, {name: 'pads', type: 'i32', length: attributes.pads.length}]; + if (attributes.mode === 0) { + uniforms.push({name: 'constant_value', type: dataType as UniformDataElementType}); + } + + return ` + ${shaderHelper.registerUniforms(uniforms).declareVariables(input, output)} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + + let indices = ${output.offsetToIndices('global_idx')}; + + var value = ${dataType}(0); + ${padSnippet} + output[global_idx] = value; + }`; + }; + return { name: 'Pad', - shaderCache: {hint: attributes.cacheKey}, + shaderCache: {hint: `${attributes.mode}`, inputDependencies}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], - dispatchGroup: {x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)} + dispatchGroup: {x: Math.ceil(ShapeUtil.size(outputShape) / 64 /* workgroup size */)}, + programUniforms }), - getShaderSource: shaderHelper => generatePadCode(shaderHelper, inputs, attributes, 'f32'), + getShaderSource, }; }; @@ -223,7 +218,7 @@ const createPadAttributesFromInputs = (inputs: readonly TensorView[], attributes const pads: number[] = []; updatePads.forEach(v => pads.push(v)); - return createAttributeWithCacheKey({mode: attributes.mode, value, pads}); + return {mode: attributes.mode, value, pads}; } else { return attributes; } @@ -234,10 +229,3 @@ export const pad = (context: ComputeContext, attributes: PadAttributes): void => const updatedAttributes = createPadAttributesFromInputs(context.inputs, attributes); context.compute(createPadProgramInfo(context.inputs, updatedAttributes), {inputs: [0]}); }; - -export const parsePadAttributes = (attributes: Record): PadAttributes => { - const mode = attributes.mode as number; - const value = attributes.value as number; - const pads = attributes.pads as number[]; - return createAttributeWithCacheKey({mode, value, pads}); -}; From 
557ac74c0565b7cfdb68bb79724d7ed1b870c121 Mon Sep 17 00:00:00 2001
From: Xu Xing
Date: Wed, 10 Jan 2024 01:57:06 +0800
Subject: [PATCH 13/29] [js/webgpu] Support gemm uniforms (#19056)

### Description
Pass Gemm's M/N/K dimensions and the alpha/beta scalars to the shader through
program uniforms, compute the optional C input's offset with broadcast-aware
indices helpers, and cache shaders by attribute key and input types rather
than by the embedded values.

### Motivation and Context
Same uniforms migration as the other WebGPU operators in this series: it
avoids recompiling the shader when only shapes or scalar attributes change.

---
 js/web/lib/wasm/jsep/webgpu/ops/gemm.ts | 123 ++++++++++++------------
 1 file changed, 64 insertions(+), 59 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts
index 1c5d28e4b8e3f..30754c84413b7 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/gemm.ts
@@ -3,10 +3,10 @@
 
 import {TensorView} from '../../tensor-view';
 import {GemmUtil, ShapeUtil} from '../../util';
-import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key';
-import {ComputeContext, ProgramInfo} from '../types';
+import {AttributeWithCacheKey} from '../attribute-with-cache-key';
+import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types';
 
-import {ShaderHelper, tensorTypeToWsglStorageType} from './common';
+import {createTensorShapeVariables, IndicesHelper, inputVariable, outputVariable, ShaderHelper, UniformsArrayType} from './common';
 
 const validateInputs = (inputs: readonly TensorView[]): void => {
   if (!inputs) {
@@ -34,25 +34,6 @@ export interface GemmAttributes extends AttributeWithCacheKey {
   beta: number;
 }
 
-const offsetC = (m: number, n: number, dims: readonly number[]): string => {
-  if (dims.length === 0) {
-    return '0u';
-  }
-
-  const broadcastM = (dims.length === 1 && m !== 1) || (dims.length === 2 && dims[0] !== m);
-  const broadcastN = dims[dims.length - 1] !== n;
-
-  let offset = '0u';
-  if (!broadcastM) {
-    offset += `+ m * ${dims[dims.length - 1]}u`;
-  }
-  if (!broadcastN) {
-    offset += '+n';
-  }
-
-  return offset;
-};
-
 const createGemmProgramInfo = (inputs: readonly TensorView[], attributes: GemmAttributes): ProgramInfo => {
   const aShape = inputs[0].dims.slice();
   const bShape = inputs[1].dims.slice();
@@ -63,68 +44,92 @@ const createGemmProgramInfo = (inputs: readonly TensorView[], attributes: GemmAt
     throw new Error('Can\'t use gemm on the given tensors');
   }
   const outputSize = ShapeUtil.size(outputShape);
-  let line = '';
-  if (attributes.transA && attributes.transB) {
-    line = 'value += a[k * M + m] * b[n * K + k];';
-  } else if (attributes.transA && !attributes.transB) {
-    line = 'value += a[k * M + m] * b[k * N + n];';
-  } else if (!attributes.transA && attributes.transB) {
-    line = 'value += a[m * K + k] * b[n * K + k];';
-  } else if (!attributes.transA && !attributes.transB) {
-    line = 'value += a[m * K + k] * b[k * N + n];';
-  }
-
-  const dataType = tensorTypeToWsglStorageType(inputs[0].dataType);
-  const calculateAlpha = attributes.alpha === 1 ? '' : 'value *= alpha;';
-  const calculateC = inputs.length === 3 ?
`value += beta * c[${offsetC(M, N, inputs[2].dims)}];` : ''; - const inputStorageBuffersDeclarations = [ - `@group(0) @binding(0) var a : array<${dataType}>;`, - `@group(0) @binding(1) var b : array<${dataType}>;` + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: outputSize}, {type: 'uint32', data: M}, {type: 'uint32', data: N}, {type: 'uint32', data: K}, + {type: 'float32', data: attributes.alpha}, {type: 'float32', data: attributes.beta} ]; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type']; if (inputs.length === 3) { - inputStorageBuffersDeclarations.push(`@group(0) @binding(2) var c : array<${dataType}>;`); + programUniforms.push(...createTensorShapeVariables(inputs[2].dims)); + inputDependencies.push('rank'); } - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const M: u32 = ${M}u; - const N: u32 = ${N}u; - const K: u32 = ${K}u; - const alpha = ${dataType}(${attributes.alpha}); - const beta = ${dataType}(${attributes.beta}); + programUniforms.push(...createTensorShapeVariables(outputShape)); + + const getShaderSource = (shaderHelper: ShaderHelper) => { + let line = ''; + if (attributes.transA && attributes.transB) { + line = 'value += a[k * uniforms.M + m] * b[n * uniforms.K + k];'; + } else if (attributes.transA && !attributes.transB) { + line = 'value += a[k * uniforms.M + m] * b[k * uniforms.N + n];'; + } else if (!attributes.transA && attributes.transB) { + line = 'value += a[m * uniforms.K + k] * b[n * uniforms.K + k];'; + } else if (!attributes.transA && !attributes.transB) { + line = 'value += a[m * uniforms.K + k] * b[k * uniforms.N + n];'; + } - ${inputStorageBuffersDeclarations.join('\n')} - @group(0) @binding(${inputs.length}) var output : array<${dataType}>; + const calculateAlpha = attributes.alpha === 1 ? 
'' : 'value *= uniforms.alpha;';
+    const a = inputVariable('a', inputs[0].dataType, inputs[0].dims);
+    const b = inputVariable('b', inputs[1].dataType, inputs[1].dims);
+    const dataType = a.type.value;
+    let c: IndicesHelper|null = null;
+    const variables = [a, b];
+    if (inputs.length === 3) {
+      c = inputVariable('c', inputs[2].dataType, inputs[2].dims.length);
+      variables.push(c);
+    }
+    const output = outputVariable('output', inputs[0].dataType, outputShape.length);
+    variables.push(output);
+    const uniforms: UniformsArrayType = [
+      {name: 'output_size', type: 'u32'}, {name: 'M', type: 'u32'}, {name: 'N', type: 'u32'}, {name: 'K', type: 'u32'},
+      {name: 'alpha', type: 'f32'}, {name: 'beta', type: 'f32'}
+    ];
+    return `
+  ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)}
 
   ${shaderHelper.mainStart()}
-    ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(outputSize)}
+    ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')}
 
-    let m = global_idx / N;
-    let n = global_idx % N;
+    let m = global_idx / uniforms.N;
+    let n = global_idx % uniforms.N;
 
     var value = ${dataType}(0);
-    for (var k: u32 = 0u; k<${K}u; k++) {
+    for (var k: u32 = 0u; k < uniforms.K; k++) {
       ${line}
     }
 
     ${calculateAlpha}
-    ${calculateC}
+    ${(() => {
+      if (c != null) {
+        return `let cOffset = ${c.broadcastedIndicesToOffset('vec2(m, n)', output)}; value += uniforms.beta * ${
+            c.getByOffset('cOffset')};`;
+      }
+      return '';
+    })()}
     output[global_idx] = value;
-  }`;
+  }`;
+  };
+
   return {
     name: 'Gemm',
-    shaderCache: {hint: attributes.cacheKey},
+    shaderCache: {hint: `${attributes.cacheKey}`, inputDependencies},
     getRunData: () => ({
       outputs: [{dims: outputShape, dataType: inputs[0].dataType}],
-      dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}
+      dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)},
+      programUniforms
    }),
     getShaderSource,
   };
 };
 
+export const parseGemmAttributes = (attributes: Record<string, unknown>): GemmAttributes => {
+  const transA = attributes.transA as boolean;
+  const transB = attributes.transB as boolean;
+  const alpha = attributes.alpha as number;
+  const beta = attributes.beta as number;
+  return {transA, transB, alpha, beta, cacheKey: `${attributes.transA};${attributes.transB};${attributes.alpha === 1}`};
+};
+
 export const gemm = (context: ComputeContext, attributes: GemmAttributes): void => {
   validateInputs(context.inputs);
   context.compute(createGemmProgramInfo(context.inputs, attributes));
 };
-
-export const parseGemmAttributes = (attributes: Record<string, unknown>): GemmAttributes =>
-    createAttributeWithCacheKey(attributes as Omit<GemmAttributes, keyof AttributeWithCacheKey>);

From ad6dd0a597256d980a357751c347670c3bdc824f Mon Sep 17 00:00:00 2001
From: zesongw
Date: Wed, 10 Jan 2024 02:10:57 +0800
Subject: [PATCH 14/29] [WebNN] Enable npm unit tests (#18486)

### Description
- Support more test cases for WebNN EP in suite-test-list.jsonc
- Add a DISABLE_WEBNN flag in build.ts in preparation for the WebNN EP release
- Add a test option, '--webnn-device-type', in test-runner-cli-args.ts to
  support running WebNN with the 'gpu' deviceType
- Use Chrome Stable as the default browser for WebNN testing to unblock the
  CI limitation.
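
For illustration only (not part of the upstream change): a minimal sketch of
how an application could request the WebNN EP with an explicit device type,
using the same `{name: 'webnn', deviceType}` option shape that the new
test-runner flag constructs. The `onnxruntime-web` import and the model path
are placeholders, not files added by this patch.

```ts
// Hypothetical usage sketch: create a session on the WebNN EP and ask for the
// GPU device. 'model.onnx' is a placeholder path.
import * as ort from 'onnxruntime-web';

async function createWebnnSession(): Promise<ort.InferenceSession> {
  return ort.InferenceSession.create('model.onnx', {
    // deviceType accepts 'cpu' or 'gpu', the two values the new
    // --webnn-device-type flag validates.
    executionProviders: [{name: 'webnn', deviceType: 'gpu'}],
  });
}
```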
--- js/web/lib/build-def.d.ts | 4 + js/web/lib/index.ts | 4 +- js/web/script/build.ts | 14 +- js/web/script/test-runner-cli-args.ts | 12 + js/web/script/test-runner-cli.ts | 14 +- js/web/test/suite-test-list.jsonc | 1098 +++++++++++++++++++++++-- js/web/test/test-main.ts | 3 +- js/web/test/test-runner.ts | 16 +- js/web/test/test-types.ts | 1 + 9 files changed, 1072 insertions(+), 94 deletions(-) diff --git a/js/web/lib/build-def.d.ts b/js/web/lib/build-def.d.ts index fb714bf5996f1..b3868871a4753 100644 --- a/js/web/lib/build-def.d.ts +++ b/js/web/lib/build-def.d.ts @@ -18,6 +18,10 @@ interface BuildDefinitions { * defines whether to disable the whole WebGpu backend in the build. */ readonly DISABLE_WEBGPU: boolean; + /** + * defines whether to disable the whole WebNN backend in the build. + */ + readonly DISABLE_WEBNN: boolean; /** * defines whether to disable the whole WebAssembly backend in the build. */ diff --git a/js/web/lib/index.ts b/js/web/lib/index.ts index 499327741c82b..4f1a3943de69a 100644 --- a/js/web/lib/index.ts +++ b/js/web/lib/index.ts @@ -28,7 +28,9 @@ if (!BUILD_DEFS.DISABLE_WASM) { registerBackend('wasm', wasmBackend, 10); if (BUILD_DEFS.DISABLE_TRAINING) { registerBackend('xnnpack', wasmBackend, 9); - registerBackend('webnn', wasmBackend, 9); + if (!BUILD_DEFS.DISABLE_WEBNN) { + registerBackend('webnn', wasmBackend, 9); + } } } diff --git a/js/web/script/build.ts b/js/web/script/build.ts index 5151f27582c1f..11892881a2adf 100644 --- a/js/web/script/build.ts +++ b/js/web/script/build.ts @@ -44,6 +44,7 @@ const SOURCE_ROOT_FOLDER = path.join(__dirname, '../..'); // /js/ const DEFAULT_DEFINE = { 'BUILD_DEFS.DISABLE_WEBGL': 'false', 'BUILD_DEFS.DISABLE_WEBGPU': 'false', + 'BUILD_DEFS.DISABLE_WEBNN': 'false', 'BUILD_DEFS.DISABLE_WASM': 'false', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'false', 'BUILD_DEFS.DISABLE_WASM_THREAD': 'false', @@ -359,6 +360,7 @@ async function main() { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGPU': 'true', 'BUILD_DEFS.DISABLE_WEBGL': 'true', + 'BUILD_DEFS.DISABLE_WEBNN': 'true', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'true', 'BUILD_DEFS.DISABLE_WASM_THREAD': 'true', }, @@ -370,6 +372,7 @@ async function main() { await addBuildTask(buildOrt({ outputBundleName: 'ort.all', format: 'iife', + define: {...DEFAULT_DEFINE} })); } @@ -394,7 +397,7 @@ async function main() { // ort.webgpu[.min].js await addAllWebBuildTasks({ outputBundleName: 'ort.webgpu', - define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true'}, + define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGL': 'true', 'BUILD_DEFS.DISABLE_WEBNN': 'true'}, }); // ort.wasm[.min].js await addAllWebBuildTasks({ @@ -404,7 +407,12 @@ async function main() { // ort.webgl[.min].js await addAllWebBuildTasks({ outputBundleName: 'ort.webgl', - define: {...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGPU': 'true', 'BUILD_DEFS.DISABLE_WASM': 'true'}, + define: { + ...DEFAULT_DEFINE, + 'BUILD_DEFS.DISABLE_WEBGPU': 'true', + 'BUILD_DEFS.DISABLE_WASM': 'true', + 'BUILD_DEFS.DISABLE_WEBNN': 'true', + }, }); // ort.wasm-core[.min].js await addAllWebBuildTasks({ @@ -413,6 +421,7 @@ async function main() { ...DEFAULT_DEFINE, 'BUILD_DEFS.DISABLE_WEBGPU': 'true', 'BUILD_DEFS.DISABLE_WEBGL': 'true', + 'BUILD_DEFS.DISABLE_WEBNN': 'true', 'BUILD_DEFS.DISABLE_WASM_PROXY': 'true', 'BUILD_DEFS.DISABLE_WASM_THREAD': 'true', }, @@ -425,6 +434,7 @@ async function main() { 'BUILD_DEFS.DISABLE_TRAINING': 'false', 'BUILD_DEFS.DISABLE_WEBGPU': 'true', 'BUILD_DEFS.DISABLE_WEBGL': 'true', + 'BUILD_DEFS.DISABLE_WEBNN': 'true', }, }); } diff 
--git a/js/web/script/test-runner-cli-args.ts b/js/web/script/test-runner-cli-args.ts
index ee955ec8d4f17..fc74adfed1fee 100644
--- a/js/web/script/test-runner-cli-args.ts
+++ b/js/web/script/test-runner-cli-args.ts
@@ -79,6 +79,7 @@ Options:
  --webgl-texture-cache-mode    Set the WebGL texture cache mode (initializerOnly/full)
  --webgl-texture-pack-mode     Set the WebGL texture pack mode (true/false)
  --webgpu-profiling-mode       Set the WebGPU profiling mode (off/default)
+ --webnn-device-type           Set the WebNN device type (cpu/gpu)

 *** Browser Options ***

@@ -174,6 +175,7 @@ export interface TestRunnerCliArgs {
   cudaFlags?: Record<string, unknown>;
   wasmOptions?: InferenceSession.WebAssemblyExecutionProviderOption;
   webglOptions?: InferenceSession.WebGLExecutionProviderOption;
+  webnnOptions?: InferenceSession.WebNNExecutionProviderOption;
   globalEnvFlags?: Test.Options['globalEnvFlags'];
   noSandbox?: boolean;
   chromiumFlags: string[];
@@ -335,6 +337,14 @@ function parseWebgpuFlags(args: minimist.ParsedArgs): Partial<Env.WebGpuFlags> {
   return {profilingMode, validateInputContent};
 }

+function parseWebNNOptions(args: minimist.ParsedArgs): InferenceSession.WebNNExecutionProviderOption {
+  const deviceType = args['webnn-device-type'];
+  if (deviceType !== undefined && deviceType !== 'cpu' && deviceType !== 'gpu') {
+    throw new Error('Flag "webnn-device-type" is invalid');
+  }
+  return {name: 'webnn', deviceType};
+}
+
 function parseGlobalEnvFlags(args: minimist.ParsedArgs): NonNullable<TestRunnerCliArgs['globalEnvFlags']> {
   const wasm = parseWasmFlags(args);
   const webgl = parseWebglFlags(args);
@@ -449,6 +459,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs
   const wasmOptions = parseWasmOptions(args);

   const webglOptions = parseWebglOptions(args);
+  const webnnOptions = parseWebNNOptions(args);

   // Option: --no-sandbox
   const noSandbox = !!args['no-sandbox'];
@@ -487,6 +498,7 @@ export function parseTestRunnerCliArgs(cmdlineArgs: string[]): TestRunnerCliArgs
     fileCache,
     cpuOptions,
     webglOptions,
+    webnnOptions,
     wasmOptions,
     globalEnvFlags,
     noSandbox,
diff --git a/js/web/script/test-runner-cli.ts b/js/web/script/test-runner-cli.ts
index 74a03290332a8..d56792c6e3595 100644
--- a/js/web/script/test-runner-cli.ts
+++ b/js/web/script/test-runner-cli.ts
@@ -165,6 +165,7 @@ async function main() {
             debug: args.debug,
             cpuOptions: args.cpuOptions,
             webglOptions: args.webglOptions,
+            webnnOptions: args.webnnOptions,
             wasmOptions: args.wasmOptions,
             globalEnvFlags: args.globalEnvFlags
           }
@@ -499,7 +500,7 @@ async function main() {
         args.bundleMode === 'perf' ? 'perf' : args.debug ?
'debug' : 'test', - webgpu, webnn); + webgpu); const karmaArgs = ['karma', 'start', `--browsers ${browser}`]; const chromiumFlags = ['--enable-features=SharedArrayBuffer', ...args.chromiumFlags]; if (args.debug) { @@ -614,11 +615,10 @@ async function main() { fs.writeJSONSync(path.join(TEST_ROOT, './testdata-config.json'), config); } - function getBrowserNameFromEnv( - env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) { + function getBrowserNameFromEnv(env: TestRunnerCliArgs['env'], mode: 'debug'|'perf'|'test', webgpu: boolean) { switch (env) { case 'chrome': - return selectChromeBrowser(mode, webgpu, webnn); + return selectChromeBrowser(mode, webgpu); case 'edge': return 'EdgeTest'; case 'firefox': @@ -634,10 +634,8 @@ async function main() { } } - function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean, webnn: boolean) { - if (webnn) { - return 'ChromeCanaryTest'; - } else if (webgpu) { + function selectChromeBrowser(mode: 'debug'|'perf'|'test', webgpu: boolean) { + if (webgpu) { return 'ChromeTest'; } else { switch (mode) { diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 594ce9feed31e..79f42e36bf390 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1501,85 +1501,424 @@ "webnn": { "onnx": ["resnet50", "squeezenet", "tiny_yolov2", "emotion_ferplus"], "node": [ - // Check in node tests that have native Wasm implementations. - // (i.e.) not tests that rely on the fallback cpu implementations. - // Use the 'cpu' level of node tests to test those implementations. + "test_abs", + "test_acos_example", + "test_acos", + "test_acosh_example", + "test_acosh", + // // "test_adagrad_multiple", + // // "test_adagrad", + // // "test_adam_multiple", + // // "test_adam", "test_add_bcast", + // "test_add_uint8", "test_add", - "test_sub_bcast", - "test_sub_example", - "test_sub", - "test_mul_bcast", - "test_mul_example", - "test_mul", - "test_div_bcast", - "test_div_example", - "test_div", - "test_xor_bcast3v1d", - "test_xor_bcast3v2d", - "test_xor_bcast4v2d", - "test_xor_bcast4v3d", - "test_xor_bcast4v4d", - "test_xor2d", - "test_xor3d", - "test_xor4d", - "test_or_bcast3v1d", - "test_or_bcast3v2d", - "test_or_bcast4v2d", - "test_or_bcast4v3d", - "test_or_bcast4v4d", - "test_and_bcast3v1d", - "test_and_bcast3v2d", - "test_and_bcast4v2d", - "test_and_bcast4v3d", - "test_and_bcast4v4d", - "test_and2d", - "test_and3d", - "test_and4d", - "test_prelu_broadcast", - "test_prelu_example", + // "test_and_bcast3v1d", + // "test_and_bcast3v2d", + // "test_and_bcast4v2d", + // "test_and_bcast4v3d", + // "test_and_bcast4v4d", + // "test_and2d", + // "test_and3d", + // "test_and4d", + // "test_argmax_default_axis_example_select_last_index", + // "test_argmax_default_axis_example", + // "test_argmax_default_axis_random_select_last_index", + // "test_argmax_default_axis_random", + // "test_argmax_keepdims_example_select_last_index", + // "test_argmax_keepdims_example", + // "test_argmax_keepdims_random_select_last_index", + // "test_argmax_keepdims_random", + // "test_argmax_negative_axis_keepdims_example_select_last_index", + // "test_argmax_negative_axis_keepdims_example", + // "test_argmax_negative_axis_keepdims_random_select_last_index", + // "test_argmax_negative_axis_keepdims_random", + // "test_argmax_no_keepdims_example_select_last_index", + // "test_argmax_no_keepdims_example", + // "test_argmax_no_keepdims_random_select_last_index", + // "test_argmax_no_keepdims_random", 
+ // "test_argmin_default_axis_example_select_last_index", + // "test_argmin_default_axis_example", + // "test_argmin_default_axis_random_select_last_index", + // "test_argmin_default_axis_random", + // "test_argmin_keepdims_example_select_last_index", + // "test_argmin_keepdims_example", + // "test_argmin_keepdims_random_select_last_index", + // "test_argmin_keepdims_random", + // "test_argmin_negative_axis_keepdims_example_select_last_index", + // "test_argmin_negative_axis_keepdims_example", + // "test_argmin_negative_axis_keepdims_random_select_last_index", + // "test_argmin_negative_axis_keepdims_random", + // "test_argmin_no_keepdims_example_select_last_index", + // "test_argmin_no_keepdims_example", + // "test_argmin_no_keepdims_random_select_last_index", + // "test_argmin_no_keepdims_random", + // "test_asin_example", + // "test_asin", + // "test_asinh_example", + // "test_asinh", + // "test_atan_example", + // "test_atan", + // "test_atanh_example", + // "test_atanh", + // "test_averagepool_1d_default", + // "test_averagepool_2d_ceil", + "test_averagepool_2d_default", + "test_averagepool_2d_pads_count_include_pad", + "test_averagepool_2d_pads", + "test_averagepool_2d_precomputed_pads_count_include_pad", + "test_averagepool_2d_precomputed_pads", + "test_averagepool_2d_precomputed_same_upper", + "test_averagepool_2d_precomputed_strides", + "test_averagepool_2d_same_lower", + "test_averagepool_2d_same_upper", + "test_averagepool_2d_strides", + // "test_averagepool_3d_default", "test_basic_conv_with_padding", "test_basic_conv_without_padding", - "test_batchnorm_epsilon", - "test_batchnorm_example", - "opset{10,11,12}/test_cast_STRING_to_FLOAT", - "test_clip_splitbounds", - "test_clip_outbounds", - "test_clip_inbounds", - "test_clip_example", - "test_clip_default_min", - "test_clip_default_max", + // "test_basic_convinteger", + // "test_batchnorm_epsilon_training_mode", + // "test_batchnorm_epsilon", + // "test_batchnorm_example_training_mode", + // "test_batchnorm_example", + // // "test_bernoulli_double_expanded", + // // "test_bernoulli_double", + // // "test_bernoulli_expanded", + // // "test_bernoulli_seed_expanded", + // // "test_bernoulli_seed", + // // "test_bernoulli", + // // "test_bitshift_left_uint16", + // // "test_bitshift_left_uint32", + // // "test_bitshift_left_uint64", + // // "test_bitshift_left_uint8", + // // "test_bitshift_right_uint16", + // // "test_bitshift_right_uint32", + // // "test_bitshift_right_uint64", + // // "test_bitshift_right_uint8", + // // "test_blackmanwindow_expanded", + // // "test_blackmanwindow_symmetric_expanded", + // // "test_blackmanwindow_symmetric", + // // "test_blackmanwindow", + // // "test_cast_BFLOAT16_to_FLOAT", + // // "test_cast_DOUBLE_to_FLOAT", + // // "test_cast_DOUBLE_to_FLOAT16", + // // "test_cast_FLOAT_to_BFLOAT16", + // // "test_cast_FLOAT_to_DOUBLE", + // // "test_cast_FLOAT_to_FLOAT16", + // // "test_cast_FLOAT_to_STRING", + // // "test_cast_FLOAT16_to_DOUBLE", + // // "test_cast_FLOAT16_to_FLOAT", + // // "test_cast_STRING_to_FLOAT", + // // "test_castlike_BFLOAT16_to_FLOAT_expanded", + // // "test_castlike_BFLOAT16_to_FLOAT", + // // "test_castlike_DOUBLE_to_FLOAT_expanded", + // // "test_castlike_DOUBLE_to_FLOAT", + // // "test_castlike_DOUBLE_to_FLOAT16_expanded", + // // "test_castlike_DOUBLE_to_FLOAT16", + // // "test_castlike_FLOAT_to_BFLOAT16_expanded", + // // "test_castlike_FLOAT_to_BFLOAT16", + // // "test_castlike_FLOAT_to_DOUBLE_expanded", + // // "test_castlike_FLOAT_to_DOUBLE", + // // 
"test_castlike_FLOAT_to_FLOAT16_expanded", + // // "test_castlike_FLOAT_to_FLOAT16", + // // "test_castlike_FLOAT_to_STRING_expanded", + // // "test_castlike_FLOAT_to_STRING", + // // "test_castlike_FLOAT16_to_DOUBLE_expanded", + // // "test_castlike_FLOAT16_to_DOUBLE", + // // "test_castlike_FLOAT16_to_FLOAT_expanded", + // // "test_castlike_FLOAT16_to_FLOAT", + // // "test_castlike_STRING_to_FLOAT_expanded", + // // "test_castlike_STRING_to_FLOAT", + "test_ceil_example", + "test_ceil", + // "test_celu_expanded", + // "test_celu", "test_clip_default_inbounds", + "test_clip_default_int8_inbounds", + "test_clip_default_int8_max", + "test_clip_default_int8_min", + "test_clip_default_max", + "test_clip_default_min", + "test_clip_example", + "test_clip_inbounds", + "test_clip_outbounds", + "test_clip_splitbounds", "test_clip", + // // "test_compress_0", + // // "test_compress_1", + // // "test_compress_default_axis", + // // "test_compress_negative_axis", + "test_concat_1d_axis_0", + "test_concat_1d_axis_negative_1", + "test_concat_2d_axis_0", + "test_concat_2d_axis_1", + "test_concat_2d_axis_negative_1", + "test_concat_2d_axis_negative_2", + "test_concat_3d_axis_0", + "test_concat_3d_axis_1", + "test_concat_3d_axis_2", + "test_concat_3d_axis_negative_1", + "test_concat_3d_axis_negative_2", + "test_concat_3d_axis_negative_3", + "test_conv_with_autopad_same", "test_conv_with_strides_and_asymmetric_padding", "test_conv_with_strides_no_padding", "test_conv_with_strides_padding", - "test_gemm_nobroadcast", + // // "test_convinteger_with_padding", + // // "test_convinteger_without_padding", + // "test_convtranspose_1d", + // // "test_convtranspose_3d", + // "test_convtranspose_autopad_same", + "test_convtranspose_dilations", + "test_convtranspose_kernel_shape", + "opset{9,17}/test_convtranspose_output_shape", + "test_convtranspose_pad", + "test_convtranspose_pads", + "test_convtranspose_with_kernel", + "test_convtranspose", + "test_cos_example", + "test_cos", + // "test_cosh_example", + // "test_cosh", + // "test_cumsum_1d_exclusive", + // "test_cumsum_1d_reverse_exclusive", + // "test_cumsum_1d_reverse", + // "test_cumsum_1d", + // "test_cumsum_2d_axis_0", + // "test_cumsum_2d_axis_1", + // "test_cumsum_2d_negative_axis", + // "test_depthtospace_crd_mode_example", + // "test_depthtospace_crd_mode", + // "test_depthtospace_dcr_mode", + // "test_depthtospace_example", + // "test_depthtospace", + // // "test_dequantizelinear_axis", + // // "test_dequantizelinear", + // // "test_det_2d", + // // "test_det_nd", + // // "test_dft_axis", + // // "test_dft_inverse", + // // "test_dft", + "test_div_bcast", + "test_div_example", + // "test_div_uint8", + "test_div", + // // "test_dropout_default_mask_ratio", + // // "test_dropout_default_mask", + // // "test_dropout_default_old", + // // "test_dropout_default_ratio", + // // "test_dropout_default", + // // "test_dropout_random_old", + // // "test_dropout_random", + // // "test_dynamic_slice_default_axes", + // // "test_dynamic_slice_end_out_of_bounds", + // // "test_dynamic_slice_neg", + // // "test_dynamic_slice_start_out_of_bounds", + // // "test_dynamic_slice", + // // "test_dynamicquantizelinear_expanded", + // // "test_dynamicquantizelinear_max_adjusted_expanded", + // // "test_dynamicquantizelinear_max_adjusted", + // // "test_dynamicquantizelinear_min_adjusted_expanded", + // // "test_dynamicquantizelinear_min_adjusted", + // // "test_dynamicquantizelinear", + // "test_edge_pad", + // "test_einsum_batch_diagonal", + // "test_einsum_batch_matmul", + // 
"test_einsum_inner_prod", + // "test_einsum_sum", + // "test_einsum_transpose", + "test_elu_default", + "test_elu_example", + "test_elu", + // "test_equal_bcast", + // "test_equal", + // "test_erf", + "test_exp_example", + "test_exp", + // "test_expand_dim_changed", + // "test_expand_dim_unchanged", + // "test_eyelike_populate_off_main_diagonal", + // "test_eyelike_with_dtype", + // "test_eyelike_without_dtype", + "test_flatten_axis0", + "test_flatten_axis1", + "test_flatten_axis2", + "test_flatten_axis3", + "test_flatten_default_axis", + "test_flatten_negative_axis1", + "test_flatten_negative_axis2", + "test_flatten_negative_axis3", + "test_flatten_negative_axis4", + "test_floor_example", + "test_floor", + // "test_gather_0", + // "test_gather_1", + // "test_gather_2d_indices", + // "test_gather_negative_indices", + // "test_gather_elements_0", + // "test_gather_elements_1", + // "test_gather_elements_negative_indices", + // "test_gather_negative_indices", + // "test_gathernd_example_float32", + // "test_gathernd_example_int32_batch_dim1", + // "test_gathernd_example_int32", + "test_gemm_all_attributes", + "test_gemm_alpha", + "test_gemm_beta", "test_gemm_broadcast", - "test_matmul_2d", - "test_matmul_3d", - "test_matmul_4d", - "test_softmax_axis_0", - "test_softmax_axis_1", - "test_softmax_axis_2", - "test_softmax_default_axis", - "test_softmax_example", - "test_softmax_large_number", - "test_sum_example", - "test_sum_one_input", - "test_sum_two_inputs", - "test_averagepool_1d_default", - "test_averagepool_2d_default", - "test_averagepool_2d_pads", - "test_averagepool_2d_precomputed_pads", - "test_averagepool_2d_precomputed_same_upper", - "test_averagepool_2d_precomputed_strides", - "test_averagepool_2d_same_upper", - "test_averagepool_2d_same_lower", - "test_averagepool_2d_strides", - "test_averagepool_3d_default", - "test_maxpool_1d_default", + "test_gemm_default_matrix_bias", + "test_gemm_default_no_bias", + // "test_gemm_default_scalar_bias", + "test_gemm_default_single_elem_vector_bias", + "test_gemm_default_vector_bias", + "test_gemm_default_zero_bias", + "test_gemm_nobroadcast", + "test_gemm_transposeA", + "test_gemm_transposeB", + "test_globalaveragepool_precomputed", + "test_globalaveragepool", + // "test_globalmaxpool_precomputed", + // "test_globalmaxpool", + // "test_greater_bcast", + // "test_greater_equal_bcast_expanded", + // "test_greater_equal_bcast", + // "test_greater_equal_expanded", + // "test_greater_equal", + // "test_greater", + // // "test_gridsample_aligncorners_true", + // // "test_gridsample_bicubic", + // // "test_gridsample_bilinear", + // // "test_gridsample_border_padding", + // // "test_gridsample_nearest", + // // "test_gridsample_reflection_padding", + // // "test_gridsample_zeros_padding", + // // "test_gridsample", + // // "test_gru_batchwise", + // // "test_gru_defaults", + // // "test_gru_seq_length", + // // "test_gru_with_initial_bias", + // // "test_hammingwindow_expanded", + // // "test_hammingwindow_symmetric_expanded", + // // "test_hammingwindow_symmetric", + // // "test_hammingwindow", + // // "test_hannwindow_expanded", + // // "test_hannwindow_symmetric_expanded", + // // "test_hannwindow_symmetric", + // // "test_hannwindow", + // // "test_hardmax_axis_0", + // // "test_hardmax_axis_1", + // // "test_hardmax_axis_2", + // // "test_hardmax_default_axis", + // // "test_hardmax_example", + // // "test_hardmax_negative_axis", + // // "test_hardmax_one_hot", + // // "test_hardsigmoid_default", + // // "test_hardsigmoid_example", + // // 
"test_hardsigmoid", + // "test_hardswish_expanded", + "test_hardswish", + // "test_if", + // TODO: Uncomment 'test_if_seq' and 'test_if_opt' once the test infra + // supports Sequence and Optional types + // "test_if_seq", + // "test_if_opt", + "test_instancenorm_epsilon", + // "test_instancenorm_example", + // "test_isinf_negative", + // "test_isinf_positive", + // "test_isinf", + // "test_isnan", + // "test_layer_normalization_2d_axis_negative_1_expanded", + // "test_layer_normalization_2d_axis_negative_1", + // "test_layer_normalization_2d_axis_negative_2_expanded", + // "test_layer_normalization_2d_axis_negative_2", + // "test_layer_normalization_2d_axis0_expanded", + // "test_layer_normalization_2d_axis0", + // "test_layer_normalization_2d_axis1_expanded", + // "test_layer_normalization_2d_axis1", + // // "test_layer_normalization_3d_axis_negative_1_epsilon_expanded", + // "test_layer_normalization_3d_axis_negative_1_epsilon", + // // "test_layer_normalization_3d_axis_negative_2_epsilon_expanded", + // "test_layer_normalization_3d_axis_negative_2_epsilon", + // // "test_layer_normalization_3d_axis_negative_3_epsilon_expanded", + // "test_layer_normalization_3d_axis_negative_3_epsilon", + // // "test_layer_normalization_3d_axis0_epsilon_expanded", + // "test_layer_normalization_3d_axis0_epsilon", + // "test_layer_normalization_3d_axis1_epsilon_expanded", + // "test_layer_normalization_3d_axis1_epsilon", + // // "test_layer_normalization_3d_axis2_epsilon_expanded", + // "test_layer_normalization_3d_axis2_epsilon", + // "test_layer_normalization_4d_axis_negative_1_expanded", + // "test_layer_normalization_4d_axis_negative_1", + // // "test_layer_normalization_4d_axis_negative_2_expanded", + // "test_layer_normalization_4d_axis_negative_2", + // "test_layer_normalization_4d_axis_negative_3_expanded", + // "test_layer_normalization_4d_axis_negative_3", + // "test_layer_normalization_4d_axis_negative_4_expanded", + // "test_layer_normalization_4d_axis_negative_4", + // "test_layer_normalization_4d_axis0_expanded", + // "test_layer_normalization_4d_axis0", + // "test_layer_normalization_4d_axis1_expanded", + // "test_layer_normalization_4d_axis1", + // "test_layer_normalization_4d_axis2_expanded", + // "test_layer_normalization_4d_axis2", + // "test_layer_normalization_4d_axis3_expanded", + // "test_layer_normalization_4d_axis3", + // "test_layer_normalization_default_axis_expanded", + // "test_layer_normalization_default_axis", + "test_leakyrelu_default", + "test_leakyrelu_example", + "test_leakyrelu", + // "test_less_bcast", + // "test_less_equal_bcast_expanded", + // "test_less_equal_bcast", + // "test_less_equal_expanded", + // "test_less_equal", + // "test_less", + "test_log_example", + "test_log", + // // "test_logsoftmax_axis_0_expanded", + // // "test_logsoftmax_axis_0", + // // "test_logsoftmax_axis_1_expanded", + // // "test_logsoftmax_axis_1", + // // "test_logsoftmax_axis_2_expanded", + // // "test_logsoftmax_axis_2", + // // "test_logsoftmax_default_axis_expanded", + // // "test_logsoftmax_default_axis", + // // "test_logsoftmax_example_1_expanded", + // // "test_logsoftmax_example_1", + // // "test_logsoftmax_large_number_expanded", + // // "test_logsoftmax_large_number", + // // "test_logsoftmax_negative_axis_expanded", + // // "test_logsoftmax_negative_axis", + // "test_lrn_default", + // "test_lrn", + // // "test_lstm_batchwise", + // // "test_lstm_defaults", + // // "test_lstm_with_initial_bias", + // // "test_lstm_with_peepholes", + // "test_matmul_2d", + // 
"test_matmul_3d", + // "test_matmul_4d", + // // "test_matmulinteger", + // "test_max_example", + // "test_max_float16", + // "test_max_float32", + // "test_max_float64", + // "test_max_int16", + // "test_max_int32", + // "test_max_int64", + // "test_max_int8", + // "test_max_one_input", + // "test_max_two_inputs", + // "test_max_uint16", + // "test_max_uint32", + // "test_max_uint64", + // "test_max_uint8", + // "test_maxpool_1d_default", + // "test_maxpool_2d_ceil", "test_maxpool_2d_default", + // "test_maxpool_2d_dilations", "test_maxpool_2d_pads", "test_maxpool_2d_precomputed_pads", "test_maxpool_2d_precomputed_same_upper", @@ -1587,13 +1926,622 @@ "test_maxpool_2d_same_lower", "test_maxpool_2d_same_upper", "test_maxpool_2d_strides", - "test_maxpool_3d_default", - "test_globalaveragepool_precomputed", - "test_globalaveragepool", - "test_globalmaxpool_precomputed", - "test_globalmaxpool", - "test_instancenorm_epsilon", - "test_instancenorm_example" + // "test_maxpool_2d_uint8", + // "test_maxpool_3d_default", + // "test_maxpool_with_argmax_2d_precomputed_pads", + // "test_maxpool_with_argmax_2d_precomputed_strides", + // // "test_maxunpool_export_with_output_shape", + // // "test_maxunpool_export_without_output_shape", + // // "test_mean_example", + // // "test_mean_one_input", + // // "test_mean_two_inputs", + // // "test_melweightmatrix", + // "test_min_example", + // "test_min_float16", + // "test_min_float32", + // "test_min_float64", + // "test_min_int16", + // "test_min_int32", + // "test_min_int64", + // "test_min_int8", + // "test_min_one_input", + // "test_min_two_inputs", + // "test_min_uint16", + // "test_min_uint32", + // "test_min_uint64", + // "test_min_uint8", + // "test_mod_bcast", + // "test_mod_broadcast", + // "test_mod_float_mixed_sign_example", + // "test_mod_fmod_mixed_sign_example", + // "test_mod_int64_fmod", + // "test_mod_int64_mixed_sign_example", + // "test_mod_mixed_sign_float16", + // "test_mod_mixed_sign_float32", + // "test_mod_mixed_sign_float64", + // "test_mod_mixed_sign_int16", + // "test_mod_mixed_sign_int32", + // "test_mod_mixed_sign_int64", + // "test_mod_mixed_sign_int8", + // "test_mod_uint16", + // "test_mod_uint32", + // "test_mod_uint64", + // "test_mod_uint8", + // // "test_momentum_multiple", + // // "test_momentum", + "test_mul_bcast", + "test_mul_example", + // "test_mul_uint8", + "test_mul", + // "test_mvn_expanded", + // "test_mvn", + "test_neg_example", + "test_neg", + // // "test_negative_log_likelihood_loss_iinput_shape_is_NCd1_weight_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_iinput_shape_is_NCd1_weight_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NC_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NC", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_mean_weight_negative_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_mean_weight_negative_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1_weight", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_expanded", + // // 
"test_negative_log_likelihood_loss_input_shape_is_NCd1d2_no_weight_reduction_mean_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_no_weight_reduction_mean_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_mean_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_mean", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_sum_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_reduction_sum", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_mean_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_mean", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight_reduction_sum", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2_with_weight", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_mean_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_mean_weight", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded", + // // "test_negative_log_likelihood_loss_input_shape_is_NCd1d2d3d4d5_none_no_weight", + // // "test_nesterov_momentum", + // // "test_nllloss_NC_expanded", + // // "test_nllloss_NC", + // // "test_nllloss_NCd1_expanded", + // // "test_nllloss_NCd1_ii_expanded", + // // "test_nllloss_NCd1_ii", + // // "test_nllloss_NCd1_mean_weight_negative_ii_expanded", + // // "test_nllloss_NCd1_mean_weight_negative_ii", + // // "test_nllloss_NCd1_weight_expanded", + // // "test_nllloss_NCd1_weight_ii_expanded", + // // "test_nllloss_NCd1_weight_ii", + // // "test_nllloss_NCd1_weight", + // // "test_nllloss_NCd1", + // // "test_nllloss_NCd1d2_expanded", + // // "test_nllloss_NCd1d2_no_weight_reduction_mean_ii_expanded", + // // "test_nllloss_NCd1d2_no_weight_reduction_mean_ii", + // // "test_nllloss_NCd1d2_reduction_mean_expanded", + // // "test_nllloss_NCd1d2_reduction_mean", + // // "test_nllloss_NCd1d2_reduction_sum_expanded", + // // "test_nllloss_NCd1d2_reduction_sum", + // // "test_nllloss_NCd1d2_with_weight_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_mean_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_mean", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum_ii_expanded", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum_ii", + // // "test_nllloss_NCd1d2_with_weight_reduction_sum", + // // "test_nllloss_NCd1d2_with_weight", + 
// // "test_nllloss_NCd1d2", + // // "test_nllloss_NCd1d2d3_none_no_weight_negative_ii_expanded", + // // "test_nllloss_NCd1d2d3_none_no_weight_negative_ii", + // // "test_nllloss_NCd1d2d3_sum_weight_high_ii_expanded", + // // "test_nllloss_NCd1d2d3_sum_weight_high_ii", + // // "test_nllloss_NCd1d2d3d4d5_mean_weight_expanded", + // // "test_nllloss_NCd1d2d3d4d5_mean_weight", + // // "test_nllloss_NCd1d2d3d4d5_none_no_weight_expanded", + // // "test_nllloss_NCd1d2d3d4d5_none_no_weight", + // "test_nonmaxsuppression_center_point_box_format", + // "test_nonmaxsuppression_flipped_coordinates", + // "test_nonmaxsuppression_identical_boxes", + // "test_nonmaxsuppression_limit_output_size", + // "test_nonmaxsuppression_single_box", + // "test_nonmaxsuppression_suppress_by_IOU_and_scores", + // "test_nonmaxsuppression_suppress_by_IOU", + // "test_nonmaxsuppression_two_batches", + // "test_nonmaxsuppression_two_classes", + // "test_nonzero_example", + // "test_not_2d", + // "test_not_3d", + // "test_not_4d", + // // "test_onehot_negative_indices", + // // "test_onehot_with_axis", + // // "test_onehot_with_negative_axis", + // // "test_onehot_without_axis", + // // "test_optional_get_element_sequence", + // // "test_optional_get_element", + // // "test_optional_has_element_empty", + // // "test_optional_has_element", + // "test_or_bcast3v1d", + // "test_or_bcast3v2d", + // "test_or_bcast4v2d", + // "test_or_bcast4v3d", + // "test_or_bcast4v4d", + // "test_or2d", + // "test_or3d", + // "test_or4d", + "test_pow_bcast_array", + "test_pow_bcast_scalar", + "test_pow_example", + // "test_pow_types_float", + // "test_pow_types_float32_int32", + // "test_pow_types_float32_int64", + // "test_pow_types_float32_uint32", + // "test_pow_types_float32_uint64", + // "test_pow_types_int", + // "test_pow_types_int32_float32", + // "test_pow_types_int32_int32", + // "test_pow_types_int64_float32", + // "test_pow_types_int64_int64", + "test_pow", + // "test_prelu_broadcast", + // "test_prelu_example", + // // "test_qlinearconv", + // // "test_qlinearmatmul_2D", + // // "test_qlinearmatmul_3D", + // // "test_quantizelinear_axis", + // // "test_quantizelinear", + // "test_range_float_type_positive_delta_expanded", + // "test_range_float_type_positive_delta", + // "test_range_int32_type_negative_delta_expanded", + // "test_range_int32_type_negative_delta", + // "test_reciprocal_example", + // "test_reciprocal", + // "test_reduce_l1_default_axes_keepdims_example", + // "test_reduce_l1_default_axes_keepdims_random", + // "test_reduce_l1_do_not_keepdims_example", + // "test_reduce_l1_do_not_keepdims_random", + // "test_reduce_l1_keep_dims_example", + // "test_reduce_l1_keep_dims_random", + // "test_reduce_l1_negative_axes_keep_dims_example", + // "test_reduce_l1_negative_axes_keep_dims_random", + // "test_reduce_l2_default_axes_keepdims_example", + // "test_reduce_l2_default_axes_keepdims_random", + // "test_reduce_l2_do_not_keepdims_example", + // "test_reduce_l2_do_not_keepdims_random", + // "test_reduce_l2_keep_dims_example", + // "test_reduce_l2_keep_dims_random", + // "test_reduce_l2_negative_axes_keep_dims_example", + // "test_reduce_l2_negative_axes_keep_dims_random", + // "test_reduce_log_sum_asc_axes", + // "test_reduce_log_sum_default", + // "test_reduce_log_sum_desc_axes", + // tests "test_reduce_log_sum_exp_*" on opset17/opset18 are excluded because they use float64. 
+ // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_example", + // "opset{7,8,9}/test_reduce_log_sum_exp_default_axes_keepdims_random", + // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_example", + // "opset{7,8,9}/test_reduce_log_sum_exp_do_not_keepdims_random", + // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_example", + // "opset{7,8,9}/test_reduce_log_sum_exp_keepdims_random", + // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_example", + // "opset11/test_reduce_log_sum_exp_negative_axes_keepdims_random", + // "test_reduce_log_sum_negative_axes", + // "test_reduce_log_sum", + // "test_reduce_max_default_axes_keepdim_example", + // "test_reduce_max_default_axes_keepdims_random", + // "test_reduce_max_do_not_keepdims_example", + // "test_reduce_max_do_not_keepdims_random", + // "test_reduce_max_keepdims_example", + // "test_reduce_max_keepdims_random", + // "test_reduce_max_negative_axes_keepdims_example", + // "test_reduce_max_negative_axes_keepdims_random", + // "test_reduce_mean_default_axes_keepdims_example", + // "test_reduce_mean_default_axes_keepdims_random", + // "test_reduce_mean_do_not_keepdims_example", + // "test_reduce_mean_do_not_keepdims_random", + // "test_reduce_mean_keepdims_example", + // "test_reduce_mean_keepdims_random", + // "test_reduce_mean_negative_axes_keepdims_example", + // "test_reduce_mean_negative_axes_keepdims_random", + // "test_reduce_min_default_axes_keepdims_example", + // "test_reduce_min_default_axes_keepdims_random", + // "test_reduce_min_do_not_keepdims_example", + // "test_reduce_min_do_not_keepdims_random", + // "test_reduce_min_keepdims_example", + // "test_reduce_min_keepdims_random", + // "test_reduce_min_negative_axes_keepdims_example", + // "test_reduce_min_negative_axes_keepdims_random", + // "test_reduce_prod_default_axes_keepdims_example", + // "test_reduce_prod_default_axes_keepdims_random", + // "test_reduce_prod_do_not_keepdims_example", + // "test_reduce_prod_do_not_keepdims_random", + // "test_reduce_prod_keepdims_example", + // "test_reduce_prod_keepdims_random", + // "test_reduce_prod_negative_axes_keepdims_example", + // "test_reduce_prod_negative_axes_keepdims_random", + // "test_reduce_sum_default_axes_keepdims_example", + // "test_reduce_sum_default_axes_keepdims_random", + // "test_reduce_sum_do_not_keepdims_example", + // "test_reduce_sum_do_not_keepdims_random", + // "test_reduce_sum_empty_axes_input_noop_example", + // "test_reduce_sum_empty_axes_input_noop_random", + // "test_reduce_sum_keepdims_example", + // "test_reduce_sum_keepdims_random", + // "test_reduce_sum_negative_axes_keepdims_example", + // "test_reduce_sum_negative_axes_keepdims_random", + // "test_reduce_sum_square_default_axes_keepdims_example", + // "test_reduce_sum_square_default_axes_keepdims_random", + // "test_reduce_sum_square_do_not_keepdims_example", + // "test_reduce_sum_square_do_not_keepdims_random", + // "test_reduce_sum_square_keepdims_example", + // "test_reduce_sum_square_keepdims_random", + // "test_reduce_sum_square_negative_axes_keepdims_example", + // "test_reduce_sum_square_negative_axes_keepdims_random", + // "test_reflect_pad", + "test_relu", + // "test_reshape_allowzero_reordered", + // "test_reshape_extended_dims", + // "test_reshape_negative_dim", + // "test_reshape_negative_extended_dims", + // "test_reshape_one_dim", + // "test_reshape_reduced_dims", + // "test_reshape_reordered_all_dims", + // "test_reshape_reordered_dims", + // "test_reshape_reordered_last_dims", + // 
"test_reshape_zero_and_negative_dim", + // "test_reshape_zero_dim", + // "test_resize_downsample_linear", + // "test_resize_downsample_nearest", + // "test_resize_downsample_scales_cubic_A_n0p5_exclude_outside", + // "test_resize_downsample_scales_cubic_align_corners", + // "test_resize_downsample_scales_cubic", + // "test_resize_downsample_scales_linear_align_corners", + // "test_resize_downsample_scales_linear", + // "test_resize_downsample_scales_nearest", + // "test_resize_downsample_sizes_cubic", + // "test_resize_downsample_sizes_linear_pytorch_half_pixel", + // "test_resize_downsample_sizes_nearest_tf_half_pixel_for_nn", + // "test_resize_downsample_sizes_nearest", + // "test_resize_nearest", + // "test_resize_tf_crop_and_resize", + // "test_resize_upsample_linear", + // "test_resize_upsample_nearest", + // "test_resize_upsample_scales_cubic_A_n0p5_exclude_outside", + // "test_resize_upsample_scales_cubic_align_corners", + // "test_resize_upsample_scales_cubic_asymmetric", + // "test_resize_upsample_scales_cubic", + // "test_resize_upsample_scales_linear_align_corners", + // "test_resize_upsample_scales_linear", + // "test_resize_upsample_scales_nearest", + // "test_resize_upsample_sizes_cubic", + // "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_ceil_half_pixel", + // "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_floor_align_corners", + // "opset{12,13,17,18}/test_resize_upsample_sizes_nearest_round_prefer_ceil_asymmetric", + // "test_resize_upsample_sizes_nearest", + // // "test_reversesequence_batch", + // // "test_reversesequence_time", + // // "test_rnn_seq_length", + // // "test_roialign_aligned_false", + // // "test_roialign_aligned_true", + // // "test_roialign", + // // "test_round", + // // "test_scan_sum", + // // "test_scan9_sum", + // // "test_scatter_elements_with_axis", + // // "test_scatter_elements_with_duplicate_indices", + // // "test_scatter_elements_with_negative_indices", + // // "test_scatter_elements_without_axis", + // // "test_scatter_with_axis", + // // "test_scatter_without_axis", + // // "test_scatternd_add", + // // "test_scatternd_multiply", + // // "test_scatternd", + // // "test_sce_mean_3d_expanded", + // // "test_sce_mean_3d_log_prob_expanded", + // // "test_sce_mean_3d_log_prob", + // // "test_sce_mean_3d", + // // "test_sce_mean_expanded", + // // "test_sce_mean_log_prob_expanded", + // // "test_sce_mean_log_prob", + // // "test_sce_mean_no_weight_ii_3d_expanded", + // // "test_sce_mean_no_weight_ii_3d_log_prob_expanded", + // // "test_sce_mean_no_weight_ii_3d_log_prob", + // // "test_sce_mean_no_weight_ii_3d", + // // "test_sce_mean_no_weight_ii_4d_expanded", + // // "test_sce_mean_no_weight_ii_4d_log_prob_expanded", + // // "test_sce_mean_no_weight_ii_4d_log_prob", + // // "test_sce_mean_no_weight_ii_4d", + // // "test_sce_mean_no_weight_ii_expanded", + // // "test_sce_mean_no_weight_ii_log_prob_expanded", + // // "test_sce_mean_no_weight_ii_log_prob", + // // "test_sce_mean_no_weight_ii", + // // "test_sce_mean_weight_expanded", + // // "test_sce_mean_weight_ii_3d_expanded", + // // "test_sce_mean_weight_ii_3d_log_prob_expanded", + // // "test_sce_mean_weight_ii_3d_log_prob", + // // "test_sce_mean_weight_ii_3d", + // // "test_sce_mean_weight_ii_4d_expanded", + // // "test_sce_mean_weight_ii_4d_log_prob_expanded", + // // "test_sce_mean_weight_ii_4d_log_prob", + // // "test_sce_mean_weight_ii_4d", + // // "test_sce_mean_weight_ii_expanded", + // // "test_sce_mean_weight_ii_log_prob_expanded", + // // 
"test_sce_mean_weight_ii_log_prob", + // // "test_sce_mean_weight_ii", + // // "test_sce_mean_weight_log_prob_expanded", + // // "test_sce_mean_weight_log_prob", + // // "test_sce_mean_weight", + // // "test_sce_mean", + // // "test_sce_NCd1_mean_weight_negative_ii_expanded", + // // "test_sce_NCd1_mean_weight_negative_ii_log_prob_expanded", + // // "test_sce_NCd1_mean_weight_negative_ii_log_prob", + // // "test_sce_NCd1_mean_weight_negative_ii", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_expanded", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob_expanded", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii_log_prob", + // // "test_sce_NCd1d2d3_none_no_weight_negative_ii", + // // "test_sce_NCd1d2d3_sum_weight_high_ii_expanded", + // // "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob_expanded", + // // "test_sce_NCd1d2d3_sum_weight_high_ii_log_prob", + // // "test_sce_NCd1d2d3_sum_weight_high_ii", + // // "test_sce_NCd1d2d3d4d5_mean_weight_expanded", + // // "test_sce_NCd1d2d3d4d5_mean_weight_log_prob_expanded", + // // "test_sce_NCd1d2d3d4d5_mean_weight_log_prob", + // // "test_sce_NCd1d2d3d4d5_mean_weight", + // // "test_sce_NCd1d2d3d4d5_none_no_weight_expanded", + // // "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob_expanded", + // // "test_sce_NCd1d2d3d4d5_none_no_weight_log_prob", + // // "test_sce_NCd1d2d3d4d5_none_no_weight", + // // "test_sce_none_expanded", + // // "test_sce_none_log_prob_expanded", + // // "test_sce_none_log_prob", + // // "test_sce_none_weights_expanded", + // // "test_sce_none_weights_log_prob_expanded", + // // "test_sce_none_weights_log_prob", + // // "test_sce_none_weights", + // // "test_sce_none", + // // "test_sce_sum_expanded", + // // "test_sce_sum_log_prob_expanded", + // // "test_sce_sum_log_prob", + // // "test_sce_sum", + // "test_selu_default", + // "test_selu_example", + // "test_selu", + // // "test_sequence_insert_at_back", + // // "test_sequence_insert_at_front", + // // "test_sequence_map_add_1_sequence_1_tensor_expanded", + // // "test_sequence_map_add_1_sequence_1_tensor", + // // "test_sequence_map_add_2_sequences_expanded", + // // "test_sequence_map_add_2_sequences", + // // "test_sequence_map_extract_shapes_expanded", + // // "test_sequence_map_extract_shapes", + // // "test_sequence_map_identity_1_sequence_1_tensor_expanded", + // // "test_sequence_map_identity_1_sequence_1_tensor", + // // "test_sequence_map_identity_1_sequence_expanded", + // // "test_sequence_map_identity_1_sequence", + // // "test_sequence_map_identity_2_sequences_expanded", + // // "test_sequence_map_identity_2_sequences", + // "test_shrink_hard", + // "test_shrink_soft", + "test_sigmoid_example", + "test_sigmoid", + // "test_sign", + // "test_simple_rnn_batchwise", + // "test_simple_rnn_defaults", + // "test_simple_rnn_with_initial_bias", + "test_sin_example", + "test_sin", + // "test_sinh_example", + // "test_sinh", + // // "test_size_example", + // // "test_size", + // "test_slice_default_axes", + // "test_slice_default_steps", + // "test_slice_end_out_of_bounds", + // "test_slice_neg_steps", + // "test_slice_neg", + // "test_slice_negative_axes", + // "test_slice_start_out_of_bounds", + // "test_slice", + // "test_softmax_axis_0_expanded", + "test_softmax_axis_0", + // "test_softmax_axis_1_expanded", + "test_softmax_axis_1", + // "test_softmax_axis_2_expanded", + "test_softmax_axis_2", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_expanded", + // 
"test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1_mean_weight_negative_ignore_index", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_none_no_weight_negative_ignore_index", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3_sum_weight_high_ignore_index", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_mean_weight", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_log_prob_expanded", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight_log_prob", + // "test_softmax_cross_entropy_input_shape_is_NCd1d2d3d4d5_none_no_weight", + // "test_softmax_cross_entropy_mean_3d_expanded", + // "test_softmax_cross_entropy_mean_3d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_3d_log_prob", + // "test_softmax_cross_entropy_mean_3d", + // "test_softmax_cross_entropy_mean_expanded", + // "test_softmax_cross_entropy_mean_log_prob_expanded", + // "test_softmax_cross_entropy_mean_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_3d", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_4d", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index_log_prob", + // "test_softmax_cross_entropy_mean_no_weight_ignore_index", + // "test_softmax_cross_entropy_mean_weight_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d_log_prob", + // "test_softmax_cross_entropy_mean_weight_ignore_index_3d", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d_log_prob_expanded", + // 
"test_softmax_cross_entropy_mean_weight_ignore_index_4d_log_prob", + // "test_softmax_cross_entropy_mean_weight_ignore_index_4d", + // "test_softmax_cross_entropy_mean_weight_ignore_index_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_ignore_index_log_prob", + // "test_softmax_cross_entropy_mean_weight_ignore_index", + // "test_softmax_cross_entropy_mean_weight_log_prob_expanded", + // "test_softmax_cross_entropy_mean_weight_log_prob", + // "test_softmax_cross_entropy_mean_weight", + // "test_softmax_cross_entropy_mean", + // "test_softmax_cross_entropy_none_expanded", + // "test_softmax_cross_entropy_none_log_prob_expanded", + // "test_softmax_cross_entropy_none_log_prob", + // "test_softmax_cross_entropy_none_weights_expanded", + // "test_softmax_cross_entropy_none_weights_log_prob_expanded", + // "test_softmax_cross_entropy_none_weights_log_prob", + // "test_softmax_cross_entropy_none_weights", + // "test_softmax_cross_entropy_none", + // "test_softmax_cross_entropy_sum_expanded", + // "test_softmax_cross_entropy_sum_log_prob_expanded", + // "test_softmax_cross_entropy_sum_log_prob", + // "test_softmax_cross_entropy_sum", + // "opset13/test_softmax_default_axis_expanded", + "opset13/test_softmax_default_axis", + // "test_softmax_example_expanded", + "test_softmax_example", + // "test_softmax_large_number_expanded", + "test_softmax_large_number", + // "test_softmax_negative_axis_expanded", + "test_softmax_negative_axis", + // // "test_softplus_example", + // // "test_softplus", + // // "test_softsign_example", + // // "test_softsign", + // "test_spacetodepth_example", + // "test_spacetodepth", + // "test_split_equal_parts_1d", + // "test_split_equal_parts_2d", + // "test_split_equal_parts_default_axis", + // "test_split_variable_parts_1d", + // "test_split_variable_parts_2d", + // "test_split_variable_parts_default_axis", + // "test_split_zero_size_splits", + "test_sqrt_example", + "test_sqrt", + // "test_squeeze_negative_axes", + // "test_squeeze", + // // "test_stft_with_window", + // // "test_stft", + // // "test_strnormalizer_export_monday_casesensintive_lower", + // // "test_strnormalizer_export_monday_casesensintive_nochangecase", + // // "test_strnormalizer_export_monday_casesensintive_upper", + // // "test_strnormalizer_export_monday_empty_output", + // // "test_strnormalizer_export_monday_insensintive_upper_twodim", + // // "test_strnormalizer_nostopwords_nochangecase", + "test_sub_bcast", + "test_sub_example", + // "test_sub_uint8", + "test_sub", + // "test_sum_example", + // "test_sum_one_input", + // "test_sum_two_inputs", + "test_tan_example", + "test_tan", + "test_tanh_example", + "test_tanh", + // // "test_tfidfvectorizer_tf_batch_onlybigrams_skip0", + // // "test_tfidfvectorizer_tf_batch_onlybigrams_skip5", + // // "test_tfidfvectorizer_tf_batch_uniandbigrams_skip5", + // // "test_tfidfvectorizer_tf_only_bigrams_skip0", + // // "test_tfidfvectorizer_tf_onlybigrams_levelempty", + // // "test_tfidfvectorizer_tf_onlybigrams_skip5", + // // "test_tfidfvectorizer_tf_uniandbigrams_skip5", + // "test_thresholdedrelu_default", + // "test_thresholdedrelu_example", + // "test_thresholdedrelu", + // "test_tile_precomputed", + // "test_tile", + // // "test_top_k_negative_axis", + // // "test_top_k_smallest", + // // "test_top_k", + // // "test_training_dropout_default_mask", + // // "test_training_dropout_default", + // // "test_training_dropout_mask", + // // 
"test_training_dropout_zero_ratio_mask", + // // "test_training_dropout_zero_ratio", + // // "test_training_dropout", + "test_transpose_all_permutations_0", + "test_transpose_all_permutations_1", + "test_transpose_all_permutations_2", + "test_transpose_all_permutations_3", + "test_transpose_all_permutations_4", + "test_transpose_all_permutations_5", + "test_transpose_default" + // "test_tril_neg", + // "test_tril_one_row_neg", + // "test_tril_out_neg", + // "test_tril_out_pos", + // "test_tril_pos", + // "test_tril_square_neg", + // "test_tril_square", + // "test_tril_zero", + // "test_tril", + // "test_triu_neg", + // "test_triu_one_row", + // "test_triu_out_neg_out", + // "test_triu_out_pos", + // "test_triu_pos", + // "test_triu_square_neg", + // "test_triu_square", + // "test_triu_zero", + // "test_triu", + // // "test_unique_not_sorted_without_axis", + // // "test_unique_sorted_with_axis_3d", + // // "test_unique_sorted_with_axis", + // // "test_unique_sorted_with_negative_axis", + // // "test_unique_sorted_without_axis", + // "test_unsqueeze_axis_0", + // "test_unsqueeze_axis_1", + // "test_unsqueeze_axis_2", + // "test_unsqueeze_axis_3", + // "test_unsqueeze_negative_axes", + // "test_unsqueeze_three_axes", + // "test_unsqueeze_two_axes", + // "test_unsqueeze_unsorted_axes", + // "test_unsqueeze", + // "test_wrap_pad" + // "test_upsample_nearest", + // "test_where_example", + // "test_where_long_example", + // "test_xor_bcast3v1d", + // "test_xor_bcast3v2d", + // "test_xor_bcast4v2d", + // "test_xor_bcast4v3d", + // "test_xor_bcast4v4d", + // "test_xor2d", + // "test_xor3d", + // "test_xor4d" ], "ops": [] } diff --git a/js/web/test/test-main.ts b/js/web/test/test-main.ts index 9bd0ec1425f95..2d83ce1e095ce 100644 --- a/js/web/test/test-main.ts +++ b/js/web/test/test-main.ts @@ -110,8 +110,7 @@ for (const group of ORT_WEB_TEST_CONFIG.model) { let context: ModelTestContext; before('prepare session', async () => { - context = await ModelTestContext.create( - test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options.sessionOptions); + context = await ModelTestContext.create(test, ORT_WEB_TEST_CONFIG.profile, ORT_WEB_TEST_CONFIG.options); }); after('release session', async () => { diff --git a/js/web/test/test-runner.ts b/js/web/test/test-runner.ts index 5e9b0910a2c68..6d5951be7b1e6 100644 --- a/js/web/test/test-runner.ts +++ b/js/web/test/test-runner.ts @@ -137,8 +137,9 @@ async function loadTensors( } async function initializeSession( - modelFilePath: string, backendHint: string, ioBindingMode: Test.IOBindingMode, profile: boolean, - sessionOptions: ort.InferenceSession.SessionOptions, fileCache?: FileCacheBuffer): Promise { + modelFilePath: string, backendHint: ort.InferenceSession.ExecutionProviderConfig, ioBindingMode: Test.IOBindingMode, + profile: boolean, sessionOptions: ort.InferenceSession.SessionOptions, + fileCache?: FileCacheBuffer): Promise { const preloadModelData: Uint8Array|undefined = fileCache && fileCache[modelFilePath] ? fileCache[modelFilePath] : undefined; Logger.verbose( @@ -232,9 +233,8 @@ export class ModelTestContext { /** * create a ModelTestContext object that used in every test cases in the given ModelTest. 
   */
-  static async create(
-      modelTest: Test.ModelTest, profile: boolean,
-      sessionOptions?: ort.InferenceSession.SessionOptions): Promise<ModelTestContext> {
+  static async create(modelTest: Test.ModelTest, profile: boolean, testOptions?: Test.Options):
+      Promise<ModelTestContext> {
     if (this.initializing) {
       throw new Error('cannot create a ModelTestContext object when the previous creation is not done');
     }
@@ -243,8 +243,12 @@ export class ModelTestContext {
       this.initializing = true;

       const initStart = now();
+      const executionProviderConfig =
+          modelTest.backend === 'webnn' ? (testOptions?.webnnOptions || 'webnn') : modelTest.backend!;
       const session = await initializeSession(
-          modelTest.modelUrl, modelTest.backend!, modelTest.ioBinding, profile, sessionOptions || {}, this.cache);
+          modelTest.modelUrl, executionProviderConfig, modelTest.ioBinding, profile, testOptions?.sessionOptions || {},
+          this.cache);
+
       const initEnd = now();

       for (const testCase of modelTest.cases) {
diff --git a/js/web/test/test-types.ts b/js/web/test/test-types.ts
index 5bdc8d84cc7a5..cd008e82e570b 100644
--- a/js/web/test/test-types.ts
+++ b/js/web/test/test-types.ts
@@ -143,6 +143,7 @@ export declare namespace Test {
     cudaFlags?: Record<string, unknown>;
     wasmOptions?: InferenceSession.WebAssemblyExecutionProviderOption;
     webglOptions?: InferenceSession.WebGLExecutionProviderOption;
+    webnnOptions?: InferenceSession.WebNNExecutionProviderOption;
     globalEnvFlags?: EnvOptions;
   }

From 574c7caf3ac4c6b0c1004b1fd9e2b020c19d441b Mon Sep 17 00:00:00 2001
From: Yifan Li <109183385+yf711@users.noreply.github.com>
Date: Tue, 9 Jan 2024 10:29:06 -0800
Subject: [PATCH 15/29] [TensorRT EP] Clear constraint of TRT plugin with different input types (#19044)

### Description
Add heterogeneous input support by skipping the input-type homogeneity check for TRT plugins that take inputs of different tensor types (see the illustrative sketch below).

### Motivation and Context
---
 .../tensorrt/tensorrt_execution_provider_custom_ops.h | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h
index 35bd38d818979..b19d9ab0f66d0 100644
--- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h
+++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.h
@@ -56,6 +56,14 @@ struct TensorRTCustomOp : Ort::CustomOpBase<TensorRTCustomOp, TensorRTCustomKernel> {

Date: Tue, 9 Jan 2024 12:00:34 -0800
Subject: [PATCH 16/29] Update transformer version for training CIs (#19046)

### Description
Updating the version to resolve a security vulnerability.
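Regarding PATCH 15 (#19044) above: a minimal sketch, under an assumed API, of how a custom op can relax the variadic-input homogeneity constraint. It relies on the optional hooks that Ort::CustomOpBase exposes for variadic custom ops (GetInputCharacteristic, GetVariadicInputHomogeneity); the op and kernel names are hypothetical, and this is not the verbatim hunk of the patch.

```cpp
// Sketch only (assumed API): a custom op whose variadic inputs may carry
// different element types. Ort::CustomOpBase defaults
// GetVariadicInputHomogeneity() to true, which requires every input in the
// variadic pack to share one tensor type; shadowing it to return false
// relaxes that constraint, which is what a TRT plugin with mixed input
// types needs.
#include "onnxruntime_cxx_api.h"

struct HypotheticalPluginKernel {
  void Compute(OrtKernelContext* /*context*/) {
    // Forward to the underlying TensorRT plugin here; the inputs are not
    // required to share an element type.
  }
};

struct HypotheticalPluginOp : Ort::CustomOpBase<HypotheticalPluginOp, HypotheticalPluginKernel> {
  void* CreateKernel(const OrtApi& /*api*/, const OrtKernelInfo* /*info*/) const {
    return new HypotheticalPluginKernel();
  }
  const char* GetName() const { return "HypotheticalPlugin"; }
  const char* GetExecutionProviderType() const { return "TensorrtExecutionProvider"; }

  size_t GetInputTypeCount() const { return 1; }
  ONNXTensorElementDataType GetInputType(size_t /*index*/) const {
    // Undefined means "accept any element type" for this input slot.
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED;
  }
  OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t /*index*/) const {
    return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_VARIADIC;
  }
  bool GetVariadicInputHomogeneity() const {
    return false;  // inputs of the variadic pack need not share a type
  }

  size_t GetOutputTypeCount() const { return 1; }
  ONNXTensorElementDataType GetOutputType(size_t /*index*/) const {
    return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT;
  }
};
```

Shadowing the base-class default is enough here: the session then skips the type-homogeneity validation for the plugin's variadic inputs.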
---
 .../stage1/requirements_torch_nightly/requirements.txt          | 4 ++--
 .../docker/scripts/training/ortmodule/stage2/requirements.txt   | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly/requirements.txt
index 0cd5e5c5d5c46..01fa7b0ff956e 100644
--- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly/requirements.txt
@@ -1,5 +1,5 @@
 scikit-learn
 packaging==21.3
-transformers==v4.30.0
-accelerate==0.20.1
+transformers==v4.36.0
+accelerate==0.25.0
 wget
diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt
index b4b265f65b69f..2b557f2aee00f 100644
--- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt
+++ b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt
@@ -2,8 +2,8 @@ pandas
 scikit-learn
 numpy==1.21.6 ; python_version < '3.11'
 numpy==1.24.2 ; python_version >= '3.11'
-transformers==v4.30.0
-accelerate
+transformers==v4.36.0
+accelerate==0.25.0
 rsa==4.9
 tensorboard==2.13.0
 h5py

From a2afd92093fea3351b8dce80b142e3eb70dcf022 Mon Sep 17 00:00:00 2001
From: Changming Sun
Date: Tue, 9 Jan 2024 13:41:10 -0800
Subject: [PATCH 17/29] Format TS code (#19066)

### Description
Format code
---
 js/web/script/build.ts | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/js/web/script/build.ts b/js/web/script/build.ts
index 11892881a2adf..a52ac4454a5c1 100644
--- a/js/web/script/build.ts
+++ b/js/web/script/build.ts
@@ -369,11 +369,7 @@ async function main() {

   if (BUNDLE_MODE === 'dev') {
     // ort.all.js
-    await addBuildTask(buildOrt({
-      outputBundleName: 'ort.all',
-      format: 'iife',
-      define: {...DEFAULT_DEFINE}
-    }));
+    await addBuildTask(buildOrt({outputBundleName: 'ort.all', format: 'iife', define: {...DEFAULT_DEFINE}}));
   }

   if (BUNDLE_MODE === 'perf') {

From 37ac9d391ccd2bb3ea331d280541b675d650209b Mon Sep 17 00:00:00 2001
From: Milos Puzovic <79916358+milpuz01@users.noreply.github.com>
Date: Tue, 9 Jan 2024 22:10:25 +0000
Subject: [PATCH 18/29] Enable Arm Compute Library 23.08 (#17672)

### Description
This PR enables onnxruntime to build with the most recent release of the Arm Compute Library.

### Motivation and Context
The latest version of the Arm Compute Library that onnxruntime can build against is 20.02, which is more than 3 years old.
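The diff below repeatedly switches on a compile-time ACL version macro. As one concrete illustration, here is a minimal sketch of the Gemm bias handling, assuming ACL 23.08 validates the GEMM bias as a 1-D tensor of length N whereas older releases accepted a 2-D {1, N} tensor; MakeGemmBiasShape is a hypothetical helper, not part of the patch.

```cpp
#include "arm_compute/core/TensorShape.h"

// Sketch only: choose the bias TensorShape rank expected by the ACL release
// in use. ACL TensorShape constructor arguments run from the innermost
// dimension outwards, so TensorShape(n) is the 1-D shape {n} and
// TensorShape(1, n) is the 2-D shape {1, n}.
inline arm_compute::TensorShape MakeGemmBiasShape(size_t n) {
#ifdef ACL_2308
  return arm_compute::TensorShape(n);     // ACL >= 23.08: 1-D bias {N}
#else
  return arm_compute::TensorShape(1, n);  // older ACL: 2-D bias {1, N}
#endif
}
```

The same #ifdef ACL_2308 guard also drives the batch-norm, conv, pool, and concat changes in this patch.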
--- cmake/CMakeLists.txt | 9 ++++-- onnxruntime/core/providers/acl/math/gemm.h | 16 ++++++++-- .../core/providers/acl/nn/batch_norm.cc | 32 ++++++++++++++++--- .../core/providers/acl/nn/batch_norm.h | 4 +-- onnxruntime/core/providers/acl/nn/conv.cc | 17 ++++++++-- onnxruntime/core/providers/acl/nn/conv.h | 3 ++ onnxruntime/core/providers/acl/nn/pool.cc | 17 ++++++++-- .../core/providers/acl/tensor/concat.cc | 32 ++++++++++++++++--- tools/ci_build/build.py | 3 +- 9 files changed, 113 insertions(+), 20 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 0f57258dca706..1567da90cacfc 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -131,6 +131,7 @@ option(onnxruntime_USE_ACL_1902 "Build with ACL version 1902 support" OFF) option(onnxruntime_USE_ACL_1905 "Build with ACL version 1905 support" OFF) option(onnxruntime_USE_ACL_1908 "Build with ACL version 1908 support" OFF) option(onnxruntime_USE_ACL_2002 "Build with ACL version 2002 support" OFF) +option(onnxruntime_USE_ACL_2308 "Build with ACL version 2308 support" OFF) option(onnxruntime_USE_ARMNN "Build with ArmNN support" OFF) option(onnxruntime_ARMNN_RELU_USE_CPU "Use the CPU implementation for the Relu operator for the ArmNN EP" ON) option(onnxruntime_ARMNN_BN_USE_CPU "Use the CPU implementation for the Batch Normalization operator for the ArmNN EP" ON) @@ -1110,7 +1111,7 @@ function(onnxruntime_add_include_to_target dst_target) endfunction() # ACL -if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002) +if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 OR onnxruntime_USE_ACL_1908 OR onnxruntime_USE_ACL_2002 OR onnxruntime_USE_ACL_2308) set(onnxruntime_USE_ACL ON) if (onnxruntime_USE_ACL_1902) add_definitions(-DACL_1902=1) @@ -1121,7 +1122,11 @@ if (onnxruntime_USE_ACL OR onnxruntime_USE_ACL_1902 OR onnxruntime_USE_ACL_1905 if (onnxruntime_USE_ACL_2002) add_definitions(-DACL_2002=1) else() - add_definitions(-DACL_1905=1) + if (onnxruntime_USE_ACL_2308) + add_definitions(-DACL_2308=1) + else() + add_definitions(-DACL_1905=1) + endif() endif() endif() endif() diff --git a/onnxruntime/core/providers/acl/math/gemm.h b/onnxruntime/core/providers/acl/math/gemm.h index d2f297e83aedb..f5288d7f231b0 100644 --- a/onnxruntime/core/providers/acl/math/gemm.h +++ b/onnxruntime/core/providers/acl/math/gemm.h @@ -49,11 +49,18 @@ class Gemm : public onnxruntime::Gemm { } Status Compute(OpKernelContext* context) const override { +#ifdef ACL_2308 + if (this->packed_b_) { + // Prepacked RHS not supported, defaulting to cpu execution provider + return onnxruntime::Gemm::Compute(context); + } +#endif const auto A = context->Input(0); const auto B = context->Input(1); const auto C = context->Input(2); - GemmHelper helper(A->Shape(), trans_A_ != CblasNoTrans, B->Shape(), trans_B_ != CblasNoTrans, C->Shape()); + GemmHelper helper(A->Shape(), trans_A_ != CblasNoTrans, B->Shape(), trans_B_ != CblasNoTrans, + C != nullptr ? C->Shape() : TensorShape({})); if (!helper.State().IsOK()) return helper.State(); @@ -70,7 +77,7 @@ class Gemm : public onnxruntime::Gemm { return onnxruntime::Gemm::Compute(context); } - arm_compute::TensorShape cShape = ACLTensorShape(C->Shape()); + arm_compute::TensorShape cShape = ACLTensorShape(C != nullptr ? 
C->Shape() : TensorShape({})); if (useC && (cShape.num_dimensions() > 2 || (cShape.num_dimensions() == 2 && cShape[0] > 1 && cShape[1] > 1))) { // Multi-dimensional Bias @@ -89,8 +96,13 @@ class Gemm : public onnxruntime::Gemm { (cShape[1] == 1 && cShape[0] != (long unsigned int)N)) { return onnxruntime::Gemm::Compute(context); } +#ifdef ACL_2308 + cShape = arm_compute::TensorShape(N); + LOGS_DEFAULT(VERBOSE) << "Bias reshaped to: {" << N << "}"; +#else cShape = arm_compute::TensorShape(1, N); LOGS_DEFAULT(VERBOSE) << "Bias reshaped to: {1," << N << "}"; +#endif } int64_t K = helper.K(); diff --git a/onnxruntime/core/providers/acl/nn/batch_norm.cc b/onnxruntime/core/providers/acl/nn/batch_norm.cc index da7fff730c96f..eb6a10074f1db 100755 --- a/onnxruntime/core/providers/acl/nn/batch_norm.cc +++ b/onnxruntime/core/providers/acl/nn/batch_norm.cc @@ -44,6 +44,16 @@ Status BatchNorm::Compute(OpKernelContext* context) const { const Tensor* M = context->Input(3); // mean const Tensor* V = context->Input(4); // var + if (S->Shape().NumDimensions() > 1) { + LOGS_DEFAULT(WARNING) << "ACL does not support scale with dimension greater then 1; defaulting to cpu implementation"; + return onnxruntime::BatchNorm::Compute(context); + } + + if (this->is_train_) { + LOGS_DEFAULT(WARNING) << "ACL does not have batchnorm training support; defaulting to cpu implementation"; + return onnxruntime::BatchNorm::Compute(context); + } + ORT_RETURN_IF_ERROR(BatchNormHelper::ValidateInputs(X, S, B, M, V)); LOGS_DEFAULT(VERBOSE) << "BatchNorm ACL:"; @@ -70,7 +80,23 @@ Status BatchNorm::Compute(OpKernelContext* context) const { auto layer = std::make_shared(); +#ifdef ACL_2308 + arm_compute::TensorShape in_x_shape; + const TensorShape& x_shape = X->Shape(); + const auto& dims_vec = x_shape.GetDims(); + in_x_shape.set(3, onnxruntime::narrow(dims_vec[0])); // N + in_x_shape.set(1, 1); // H + size_t W = 1; + for (size_t i = 2; i < dims_vec.size(); ++i) { + W *= narrow(dims_vec[i]); + } + in_x_shape.set(0, W); // W + in_x_shape.set(2, onnxruntime::narrow(dims_vec[1])); // C + + tbatch_norm.in->allocator()->init(arm_compute::TensorInfo(in_x_shape, arm_compute::Format::F32)); +#else tbatch_norm.in->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(X->Shape()), arm_compute::Format::F32)); +#endif tbatch_norm.out->allocator()->init(arm_compute::TensorInfo(tbatch_norm.in->info()->tensor_shape(), arm_compute::Format::F32)); tbatch_norm.scale->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(S->Shape()), arm_compute::Format::F32)); @@ -132,11 +158,7 @@ ONNX_OPERATOR_VERSIONED_KERNEL_EX( 7, 9, kAclExecutionProvider, KernelDefBuilder() - .TypeConstraint("X", DataTypeImpl::GetTensorType()) - .TypeConstraint("scale", DataTypeImpl::GetTensorType()) - .TypeConstraint("B", DataTypeImpl::GetTensorType()) - .TypeConstraint("mean", DataTypeImpl::GetTensorType()) - .TypeConstraint("var", DataTypeImpl::GetTensorType()), + .TypeConstraint("T", DataTypeImpl::GetTensorType()), BatchNorm); } // namespace acl diff --git a/onnxruntime/core/providers/acl/nn/batch_norm.h b/onnxruntime/core/providers/acl/nn/batch_norm.h index c9ec08b67a779..264301976e6dc 100755 --- a/onnxruntime/core/providers/acl/nn/batch_norm.h +++ b/onnxruntime/core/providers/acl/nn/batch_norm.h @@ -31,9 +31,9 @@ typedef struct { typedef std::map::iterator BatchNormLayersIterator; template -class BatchNorm final : public OpKernel { +class BatchNorm : public onnxruntime::BatchNorm { public: - explicit BatchNorm(const OpKernelInfo& info) : OpKernel(info) { + explicit 
BatchNorm(const OpKernelInfo& info) : onnxruntime::BatchNorm(info) { auto st = info.GetAttr("epsilon", &epsilon_); ORT_ENFORCE(st.IsOK(), st.ErrorMessage()); diff --git a/onnxruntime/core/providers/acl/nn/conv.cc b/onnxruntime/core/providers/acl/nn/conv.cc index 1613d927d0f74..85bd0cfe96279 100644 --- a/onnxruntime/core/providers/acl/nn/conv.cc +++ b/onnxruntime/core/providers/acl/nn/conv.cc @@ -105,7 +105,11 @@ Status Conv::Compute(OpKernelContext* context) const { TensorShapeVector Y_dims; Y_dims.insert(Y_dims.begin(), {N, M}); TensorShape input_shape = X->Shape().Slice(2); +#ifdef ACL_2308 + ORT_RETURN_IF_ERROR(conv_attrs_.InferPadsAndOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims)); +#else ORT_RETURN_IF_ERROR(conv_attrs_.InferOutputShape(input_shape, kernel_shape, strides, dilations, pads, Y_dims)); +#endif Tensor* Y = context->Output(0, TensorShape(Y_dims)); LOGS_DEFAULT(VERBOSE) << "Y " << Y->Shape().ToString().c_str(); @@ -222,6 +226,15 @@ Status Conv::Compute(OpKernelContext* context) const { 1 /* depth multiplier */, acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(), arm_compute::Size2D(aclDilation0, dilations[0]))); +#elif defined(ACL_2308) + bool optimizable = bool(arm_compute::NEDepthwiseConvolutionLayer::validate(tconv.in->info(), + tconv.k->info(), + (B != nullptr) ? tconv.b->info() : nullptr, + tconv.out->info(), + aclPadStride, + 1 /* depth multiplier */, + acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(), + arm_compute::Size2D(aclDilation0, dilations[0]))); #endif if (optimizable) { @@ -230,7 +243,7 @@ Status Conv::Compute(OpKernelContext* context) const { auto layer = std::make_shared(); #elif defined(ACL_1908) auto layer = std::make_shared(); -#elif defined(ACL_2002) +#elif defined(ACL_2002) || defined(ACL_2308) auto layer = std::make_shared(); #endif @@ -238,7 +251,7 @@ Status Conv::Compute(OpKernelContext* context) const { layer->configure(tconv.in.get(), tconv.k.get(), (B != nullptr) ? tconv.b.get() : nullptr, tconv.out.get(), aclPadStride, 1 /* depth multiplier */, acl_activ_enabled ? arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo()); -#elif defined(ACL_1905) || defined(ACL_1908) || defined(ACL_2002) +#elif defined(ACL_1905) || defined(ACL_1908) || defined(ACL_2002) || defined(ACL_2308) layer->configure(tconv.in.get(), tconv.k.get(), (B != nullptr) ? tconv.b.get() : nullptr, tconv.out.get(), aclPadStride, 1 /* depth multiplier */, acl_activ_enabled ? 
arm_compute::ActivationLayerInfo(acl_activ_func, conv_attrs_.alpha) : arm_compute::ActivationLayerInfo(), diff --git a/onnxruntime/core/providers/acl/nn/conv.h b/onnxruntime/core/providers/acl/nn/conv.h index ecb11fb3c8f4e..660d47b4172df 100644 --- a/onnxruntime/core/providers/acl/nn/conv.h +++ b/onnxruntime/core/providers/acl/nn/conv.h @@ -8,6 +8,9 @@ #include "core/providers/acl/acl_execution_provider.h" // ACL +#ifdef ACL_2308 +#include "arm_compute/runtime/Tensor.h" +#endif #include "arm_compute/core/TensorInfo.h" #include "arm_compute/runtime/TensorAllocator.h" #include "arm_compute/runtime/Allocator.h" diff --git a/onnxruntime/core/providers/acl/nn/pool.cc b/onnxruntime/core/providers/acl/nn/pool.cc index dc79ae65bf21e..8fbcba3ed87a7 100644 --- a/onnxruntime/core/providers/acl/nn/pool.cc +++ b/onnxruntime/core/providers/acl/nn/pool.cc @@ -61,7 +61,14 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context, tpool.out->allocator()->init(arm_compute::TensorInfo(ACLTensorShape(Y->Shape(), PREF_DIM), arm_compute::Format::F32)); if (pool_attrs.global_pooling) { - layer->configure(tpool.in.get(), tpool.out.get(), arm_compute::PoolingLayerInfo(pool_type)); + layer->configure(tpool.in.get(), + tpool.out.get(), + arm_compute::PoolingLayerInfo(pool_type +#ifdef ACL_2308 + , + arm_compute::DataLayout::NCHW +#endif + )); } else { TensorShapeVector aclStrides(2); aclStrides[0] = (strides.size() == 2) ? strides[1] : 1; @@ -104,7 +111,13 @@ ACLNEPool PoolOperation(onnxruntime::OpKernelContext* context, LOGS_DEFAULT(VERBOSE) << "strides: {" << aclStrides[0] << "," << aclStrides[1] << "}"; LOGS_DEFAULT(VERBOSE) << "excludePadding: " << excludePadding; - arm_compute::PoolingLayerInfo pool_info(pool_type, aclSize, aclPadStride, excludePadding); + arm_compute::PoolingLayerInfo pool_info(pool_type, + aclSize, +#ifdef ACL_2308 + arm_compute::DataLayout::NCHW, +#endif + aclPadStride, + excludePadding); layer->configure(tpool.in.get(), tpool.out.get(), pool_info); } diff --git a/onnxruntime/core/providers/acl/tensor/concat.cc b/onnxruntime/core/providers/acl/tensor/concat.cc index 081472729cfcf..75eedaac80aea 100644 --- a/onnxruntime/core/providers/acl/tensor/concat.cc +++ b/onnxruntime/core/providers/acl/tensor/concat.cc @@ -10,6 +10,8 @@ #include "core/providers/acl/acl_common.h" #include "core/providers/acl/acl_fwd.h" +#include + #define PREF_DIM 4 namespace onnxruntime { @@ -22,17 +24,27 @@ Status Concat::Compute(OpKernelContext* ctx) const { return onnxruntime::Concat::Compute(ctx); } + if (axis_ < 0) { + LOGS_DEFAULT(WARNING) << "ACL does not have support for negative axis; defaulting to cpu implementation"; + return onnxruntime::Concat::Compute(ctx); + } + // Number of input tensors to concatenate auto input_count = Node().InputArgCount().front(); // Hold pointers to the input tensors to be used in the PrepareForCompute() step std::vector input_tensors; - input_tensors.reserve(input_count); + int empty_tensors = 0; for (int i = 0; i < input_count; ++i) { + if (ctx->Input(i)->Shape().Size() == 0) { + empty_tensors++; + continue; + } input_tensors.push_back(ctx->Input(i)); } + input_count -= empty_tensors; - auto output_dims = input_tensors[0]->Shape().AsShapeVector(); + auto output_dims = ctx->Input(0)->Shape().AsShapeVector(); // 'Concat' mode if (!is_stack_) { @@ -64,7 +76,11 @@ Status Concat::Compute(OpKernelContext* ctx) const { LOGS_DEFAULT(VERBOSE) << "Concat ACL:"; arm_compute::Tensor output; +#ifdef ACL_2308 + std::vector inputs_vector; +#else std::vector inputs_vector; +#endif for 
(int i = 0; i < input_count; i++) { arm_compute::Tensor* input = new arm_compute::Tensor(); auto X = input_tensors[i]; @@ -75,7 +91,9 @@ Status Concat::Compute(OpKernelContext* ctx) const { } arm_compute::NEConcatenateLayer layer; - layer.configure(inputs_vector, &output, 3 - axis_); + if (input_count > 0) { + layer.configure(inputs_vector, &output, 3 - axis_); + } LOGS_DEFAULT(VERBOSE) << "axis: " << axis_; LOGS_DEFAULT(VERBOSE) << std::endl; @@ -83,7 +101,11 @@ Status Concat::Compute(OpKernelContext* ctx) const { for (int i = 0; i < input_count; i++) { auto X = input_tensors[i]; const T* x_data = X->Data(); +#ifdef ACL_2308 + arm_compute::Tensor* in = const_cast(static_cast(inputs_vector[i])); +#else arm_compute::Tensor* in = static_cast(inputs_vector[i]); +#endif if (X->Shape().Size() != 0 && in->info()->has_padding()) { in->allocator()->allocate(); @@ -101,7 +123,9 @@ Status Concat::Compute(OpKernelContext* ctx) const { ACLImportMemory(output.allocator(), (void*)y_data, Y->Shape().Size() * 4); } - layer.run(); + if (input_count > 0) { + layer.run(); + } if (Y->Shape().Size() != 0 && output.info()->has_padding()) { importDataFromTensor(&output, y_data); diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index c655100fbf475..3d0ec92a7bd23 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -606,7 +606,7 @@ def convert_arg_line_to_args(self, arg_line): "--use_acl", nargs="?", const="ACL_1905", - choices=["ACL_1902", "ACL_1905", "ACL_1908", "ACL_2002"], + choices=["ACL_1902", "ACL_1905", "ACL_1908", "ACL_2002", "ACL_2308"], help="Build with ACL for ARM architectures.", ) parser.add_argument("--acl_home", help="Path to ACL home dir") @@ -1031,6 +1031,7 @@ def generate_build_tree( "-Donnxruntime_USE_ACL_1905=" + ("ON" if args.use_acl == "ACL_1905" else "OFF"), "-Donnxruntime_USE_ACL_1908=" + ("ON" if args.use_acl == "ACL_1908" else "OFF"), "-Donnxruntime_USE_ACL_2002=" + ("ON" if args.use_acl == "ACL_2002" else "OFF"), + "-Donnxruntime_USE_ACL_2308=" + ("ON" if args.use_acl == "ACL_2308" else "OFF"), "-Donnxruntime_USE_ARMNN=" + ("ON" if args.use_armnn else "OFF"), "-Donnxruntime_ARMNN_RELU_USE_CPU=" + ("OFF" if args.armnn_relu else "ON"), "-Donnxruntime_ARMNN_BN_USE_CPU=" + ("OFF" if args.armnn_bn else "ON"), From 76dfe5347cfdc00a555fb4e1867acf77dad5c819 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Wed, 10 Jan 2024 06:56:00 +0800 Subject: [PATCH 19/29] [js/webgpu] Support uniforms for instance-norm (#18929) Co-authored-by: Guenther Schmuelling --- .../lib/wasm/jsep/webgpu/op-resolve-rules.ts | 4 +- .../lib/wasm/jsep/webgpu/ops/instance-norm.ts | 182 ++++++++++-------- js/web/test/data/ops/instance-norm.jsonc | 149 ++++++++++++++ 3 files changed, 249 insertions(+), 86 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index ed174bc0982eb..286dfaf968211 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -17,7 +17,7 @@ import {gather, parseGatherAttributes} from './ops/gather'; import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements'; import {gemm, parseGemmAttributes} from './ops/gemm'; import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm'; -import {layerNorm} from './ops/layer-norm'; +import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm'; import {matMul} from './ops/matmul'; import {multiHeadAttention, parseMultiHeadAttentionAttributes} from 
'./ops/multi-head-attentiion'; import {pad} from './ops/pad'; @@ -83,7 +83,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['Greater', [binaryOps.greater]], ['GreaterOrEqual', [binaryOps.greaterOrEqual]], ['InstanceNormalization', [instanceNorm, parseInstanceNormAttributes]], - ['LayerNormalization', [layerNorm]], + ['LayerNormalization', [layerNorm, parseLayerNormAttributes]], ['LeakyRelu', [unaryOps.leakyRelu, unaryOps.parseAlphaAttributes]], ['Less', [binaryOps.less]], ['LessOrEqual', [binaryOps.lessOrEqual]], diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts index 3a84844544c96..056dd54d54591 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts @@ -4,58 +4,56 @@ import {DataType} from '../../../wasm-common'; import {TensorView} from '../../tensor-view'; import {ShapeUtil} from '../../util'; -import {AttributeWithCacheKey, createAttributeWithCacheKey} from '../attribute-with-cache-key'; -import {ComputeContext, ProgramInfo} from '../types'; +import {ComputeContext, ProgramInfo, ProgramInputTensorInfoDependency, ProgramUniform} from '../types'; -import {fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType} from './common'; +import {createTensorShapeVariables, fillVector, getMaxComponents, inputVariable, outputVariable, ShaderHelper, sumVector, tensorTypeToWsglStorageType, UniformsArrayType} from './common'; -export interface InstanceNormAttributes extends AttributeWithCacheKey { +export interface InstanceNormAttributes { epsilon: number; format: 'NHWC'|'NCHW'; } -const metadata = { - name: 'InstanceNormalization' -}; - const createInstanceNormProgramInfo = (inputs: readonly TensorView[], attributes: InstanceNormAttributes): ProgramInfo => { const xShape = inputs[0].dims; - const outputShape = xShape; const axis = 2; const normCount = ShapeUtil.sizeToDimension(xShape, axis); const normSize = ShapeUtil.sizeFromDimension(xShape, axis); const components = getMaxComponents(normSize); const normPackedSize = normSize / components; - const C = xShape[1]; - const x = inputVariable('x', inputs[0].dataType, [xShape[0], xShape[1], normPackedSize], components); - const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims); - const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims); - const output = outputVariable('output', inputs[0].dataType, [xShape[0], xShape[1], normPackedSize], components); - const variables = [x, scale, bias, output]; - const dataType = x.type.value; - const f32Type = components === 1 ? 
'f32' : `vec${components}`; - const workgroupSize = 64; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - - const C: u32 = ${C}; - const normSize: u32 = ${normSize}; - const epsilon: f32 = ${attributes.epsilon}; + const inputShape = [xShape[0], xShape[1], normPackedSize]; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['rank', 'type', 'type']; + const programUniforms: ProgramUniform[] = + [{type: 'uint32', data: normSize}, {type: 'uint32', data: normPackedSize}]; + programUniforms.push(...createTensorShapeVariables(inputShape), ...createTensorShapeVariables(inputShape)); + + const getShaderSource = (shaderHelper: ShaderHelper) => { + const x = inputVariable('x', inputs[0].dataType, inputShape.length, components); + const scale = inputVariable('scale', inputs[1].dataType, inputs[1].dims); + const bias = inputVariable('bias', inputs[2].dataType, inputs[2].dims); + const output = outputVariable('output', inputs[0].dataType, inputShape.length, components); + const variables = [x, scale, bias, output]; + const dataType = x.type.value; + const f32Type = components === 1 ? 'f32' : `vec${components}`; + const workgroupSize = 64; + + const uniforms: UniformsArrayType = [{name: 'normSize', type: 'u32'}, {name: 'normPackedSize', type: 'u32'}]; + return ` var meanShared : f32; var squaredNormShared : f32; var workgroupShared : array<${f32Type}, ${workgroupSize}>; const workgroupSize = ${workgroupSize}u; - ${shaderHelper.declareVariables(...variables)} + ${shaderHelper.registerUniforms(uniforms).declareVariables(...variables)} ${shaderHelper.mainStart(workgroupSize)} let norm = global_idx / workgroupSize; - let batch = norm / C; - let channel = norm % C; + let batch = norm / uniforms.x_shape[1]; + let channel = norm % uniforms.x_shape[1]; let localIndex = local_id.x; // initialize workgroup memory var initial = ${f32Type}(0); - for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) { + for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) { initial = initial + ${f32Type}(${x.get('batch', 'channel', 'h')}); } workgroupShared[localIndex] = initial; @@ -69,13 +67,13 @@ const createInstanceNormProgramInfo = workgroupBarrier(); } if (localIndex == 0) { - meanShared = ${sumVector('workgroupShared[0]', components)} / f32(normSize); + meanShared = ${sumVector('workgroupShared[0]', components)} / f32(uniforms.normSize); } workgroupBarrier(); // reinitialize workgroup memory. 
initial = ${f32Type}(0); - for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) { + for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) { let deviation = ${f32Type}(${x.get('batch', 'channel', 'h')}) - ${f32Type}(meanShared); initial = initial + deviation * deviation; } @@ -94,23 +92,26 @@ const createInstanceNormProgramInfo = } workgroupBarrier(); - let invStdDev = 1 / sqrt(squaredNormShared / f32(normSize) + epsilon); + let invStdDev = 1 / sqrt(squaredNormShared / f32(uniforms.normSize) + f32(${attributes.epsilon})); let channelScale = invStdDev * f32(${scale.getByOffset('channel')}); let channelShift = f32(${bias.getByOffset('channel')}) - meanShared * channelScale; - for (var h = localIndex; h < ${normPackedSize}; h += workgroupSize) { + for (var h = localIndex; h < uniforms.normPackedSize; h += workgroupSize) { let value = ${x.get('batch', 'channel', 'h')} * ${dataType}(${f32Type}(channelScale)) + ${dataType}(${ - f32Type}(channelShift)); + f32Type}(channelShift)); ${output.set('batch', 'channel', 'h', 'value')}; } }`; + }; return { - ...metadata, - shaderCache: {hint: attributes.cacheKey}, + ...{name: 'InstanceNormalization'}, + // TODO: use epsilon as uniform. Currently epsilon as uniform fails test_instancenorm_epsilon. + shaderCache: {hint: `${attributes.epsilon};${components}`, inputDependencies}, getRunData: () => ({ outputs: [ {dims: outputShape, dataType: inputs[0].dataType}, ], - dispatchGroup: {x: normCount} + dispatchGroup: {x: normCount}, + programUniforms }), getShaderSource, }; @@ -120,10 +121,6 @@ const computeMean = (context: ComputeContext, input: TensorView, scale: TensorView, bias: TensorView, n: number, h: number, c: number, epsilon: number) => { const components = getMaxComponents(c); - const inputHelper = inputVariable('input', input.dataType, input.dims, components); - const scaleHelper = inputVariable('scale', scale.dataType, scale.dims, components); - const biasHelper = inputVariable('bias', bias.dataType, bias.dims, components); - const WG = 64; // we will store channel scale and channel shift in [2, components] matrix // or in vec2 when components == 1 @@ -133,65 +130,79 @@ const computeMean = const unitsOfWork = n * c / components; const wgSize = Math.ceil(h / WG); - const getMeanShaderSource = (shaderHelper: ShaderHelper) => ` - const H: u32 = ${h}; - const C: u32 = ${c / components}; - const imageSize: u32 = ${h * c / components}; + const meanInputDependencies: ProgramInputTensorInfoDependency[] = ['type']; + const meanProgramUniforms: ProgramUniform[] = [ + {type: 'uint32', data: wgSize}, {type: 'uint32', data: h}, {type: 'uint32', data: Math.floor(c / components)}, + {type: 'uint32', data: Math.floor(h * c / components)} + ]; + const getMeanShaderSource = (shaderHelper: ShaderHelper) => { + const inputHelper = inputVariable('input', input.dataType, input.dims, components); + return ` ${shaderHelper.declareVariables(inputHelper)} @group(0) @binding(1) var output : array<${outputType}>; + struct Uniforms {wg_size:u32, H:u32, C:u32, image_size:u32}; + @group(0) @binding(2) var uniforms: Uniforms; ${shaderHelper.mainStart(WG)} - let currentImageNumber = global_idx / ${WG} / C; - let currentChannelNumber = (global_idx / ${WG}) % C; + let currentImageNumber = global_idx / ${WG} / uniforms.C; + let currentChannelNumber = (global_idx / ${WG}) % uniforms.C; let wgId = global_idx % ${WG}; - let wgOffset = wgId * ${wgSize}; - if (wgOffset >= H) { + let wgOffset = wgId * uniforms.wg_size; + if (wgOffset >= uniforms.H) { return; 
} - let wgMax = min(wgOffset + ${wgSize}, H); + let wgMax = min(wgOffset + uniforms.wg_size, uniforms.H); - let offset = currentImageNumber * imageSize + currentChannelNumber; + let offset = currentImageNumber * uniforms.image_size + currentChannelNumber; var sum = ${fillVector('f32', components)}; var squaredSum = ${fillVector('f32', components)}; for (var i: u32 = wgOffset; i < wgMax; i++) { - let value = ${sumCastType}(input[offset + i * C]); + let value = ${sumCastType}(input[offset + i * uniforms.C]); sum += value; squaredSum += value * value; } output[global_idx] = ${setOutputValue('sum', 'squaredSum')}; }`; + }; const meanValues = context.compute( { name: 'InstanceNormComputeMean', - shaderCache: {hint: JSON.stringify({components, n, h, c})}, + shaderCache: {hint: `${components}`, inputDependencies: meanInputDependencies}, getRunData: () => ({ outputs: [ {dims: [n, c, WG, 2], dataType: DataType.float}, ], dispatchGroup: {x: n * c / components}, + programUniforms: meanProgramUniforms }), getShaderSource: getMeanShaderSource, }, {inputs: [input], outputs: [-1]})[0]; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const H: u32 = ${h}; - const C: u32 = ${c / components}; - const imageSize: u32 = ${WG * c / components}; - const epsilon: f32 = ${epsilon}; + const programUniforms: ProgramUniform[] = [ + {type: 'uint32', data: unitsOfWork}, {type: 'uint32', data: h}, + {type: 'uint32', data: Math.floor(c / components)}, {type: 'uint32', data: Math.floor(WG * c / components)} + ]; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type', 'type']; + const getShaderSource = (shaderHelper: ShaderHelper) => { + const scaleHelper = inputVariable('scale', scale.dataType, scale.dims, components); + const biasHelper = inputVariable('bias', bias.dataType, bias.dims, components); + return ` @group(0) @binding(0) var input : array<${outputType}>; @group(0) @binding(1) var scale : array<${scaleHelper.type.storage}>; @group(0) @binding(2) var bias : array<${biasHelper.type.storage}>; @group(0) @binding(3) var output : array<${outputType}>; + struct Uniforms {units_of_work : u32, H: u32, C : u32, image_size : u32}; + @group(0) @binding(4) var uniforms: Uniforms; ${shaderHelper.mainStart()} - ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes(unitsOfWork)} - let currentImageNumber = global_idx / C; - let currentChannelNumber = global_idx % C; + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.units_of_work')} + let currentImageNumber = global_idx / uniforms.C; + let currentChannelNumber = global_idx % uniforms.C; - let offset = currentImageNumber * imageSize; + let offset = currentImageNumber * uniforms.image_size; var sum = ${fillVector('f32', components)}; var squaredSum = ${fillVector('f32', components)}; for (var i: u32 = 0; i < ${WG}; i++) { @@ -199,24 +210,26 @@ const computeMean = sum += value[0]; squaredSum += value[1]; } - sum = sum / f32(H); - squaredSum = squaredSum / f32(H); - let invStdDev = 1 / sqrt(squaredSum - sum * sum + epsilon); + sum = sum / f32(uniforms.H); + squaredSum = squaredSum / f32(uniforms.H); + let invStdDev = 1 / sqrt(squaredSum - sum * sum + f32(${epsilon})); let channelScale = invStdDev * ${sumCastType}(scale[currentChannelNumber]); let channelShift = ${sumCastType}(bias[currentChannelNumber]) - sum * channelScale; output[global_idx] = ${setOutputValue('channelScale', 'channelShift')}; }`; - + }; return context.compute( { name: 'InstanceNormComputeChannelScaleShift', - shaderCache: {hint: JSON.stringify({components, n, h, 
c, epsilon})}, + // TODO: use epsilon as uniform. Currently epsilon as uniform fails test_instancenorm_epsilon. + shaderCache: {hint: `${components};${epsilon}`, inputDependencies}, getRunData: () => ({ outputs: [ {dims: [n, c, 2], dataType: DataType.float}, ], dispatchGroup: {x: Math.ceil(unitsOfWork / 64 /* workgroup size */)}, + programUniforms }), getShaderSource, }, @@ -230,50 +243,51 @@ const createInstanceNormNHWCProgramInfo = const N = xShape[0]; const C = xShape[xShape.length - 1]; const H = ShapeUtil.sizeFromDimension(xShape, 1) / C; - const components = getMaxComponents(C); const outputSize = ShapeUtil.size(outputShape) / components; - const inputHelper = inputVariable('input', inputs[0].dataType, inputs[0].dims, components); - const outputHelper = outputVariable('output', inputs[0].dataType, outputShape, components); - - const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); - const scaleType = components === 1 ? 'vec2f' : `mat2x${components}f`; - const scaleCastType = components === 1 ? dataType : `vec${components}<${dataType}>`; + const programUniforms: ProgramUniform[] = + [{type: 'uint32', data: H}, {type: 'uint32', data: Math.floor(C / components)}]; + const inputDependencies: ProgramInputTensorInfoDependency[] = ['type', 'type']; // first compute mean const channelScaleShift = computeMean(context, inputs[0], inputs[1], inputs[2], N, H, C, attributes.epsilon); + const getShaderSource = (shaderHelper: ShaderHelper) => { + const dataType = tensorTypeToWsglStorageType(inputs[0].dataType); + const scaleType = components === 1 ? 'vec2f' : `mat2x${components}f`; + const scaleCastType = components === 1 ? dataType : `vec${components}<${dataType}>`; - const getShaderSource = (shaderHelper: ShaderHelper) => ` - const H: u32 = ${H}; - const C: u32 = ${C / components}; + const inputHelper = inputVariable('input', inputs[0].dataType, inputs[0].dims, components); + const outputHelper = outputVariable('output', inputs[0].dataType, outputShape, components); + return ` @group(0) @binding(0) var input : array<${inputHelper.type.storage}>; @group(0) @binding(1) var scaleInput : array<${scaleType}>; @group(0) @binding(2) var output : array<${outputHelper.type.storage}>; + struct Uniforms {H: u32, C : u32}; + @group(0) @binding(3) var uniforms: Uniforms; ${shaderHelper.mainStart()} - let currentImageNumber = global_idx / (C * H); - let currentChannelNumber = global_idx % C; + let currentImageNumber = global_idx / (uniforms.C * uniforms.H); + let currentChannelNumber = global_idx % uniforms.C; - let scaleOffset = currentImageNumber * C + currentChannelNumber; + let scaleOffset = currentImageNumber * uniforms.C + currentChannelNumber; let scale = scaleInput[scaleOffset]; output[global_idx] = fma(input[global_idx], ${scaleCastType}(scale[0]), ${scaleCastType}(scale[1])); }`; + }; context.compute( { - name: 'InstanceNormalization', - shaderCache: {hint: `${attributes.cacheKey}`}, + name: 'InstanceNormalizationNHWC', + shaderCache: {hint: `${components}`, inputDependencies}, getRunData: () => ({ outputs: [{dims: outputShape, dataType: inputs[0].dataType}], - dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)} + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms }), getShaderSource, }, {inputs: [inputs[0], channelScaleShift]}); }; -export const parseInstanceNormAttributes = (attributes: InstanceNormAttributes): InstanceNormAttributes => - createAttributeWithCacheKey({epsilon: attributes.epsilon, format: attributes.format}); - export 
const instanceNorm = (context: ComputeContext, attributes: InstanceNormAttributes): void => { if (attributes.format === 'NHWC') { createInstanceNormNHWCProgramInfo(context, context.inputs, attributes); diff --git a/js/web/test/data/ops/instance-norm.jsonc b/js/web/test/data/ops/instance-norm.jsonc index 6a4e6912405ee..e89ac2da3795f 100644 --- a/js/web/test/data/ops/instance-norm.jsonc +++ b/js/web/test/data/ops/instance-norm.jsonc @@ -38,6 +38,79 @@ } ] }, + { + "name": "Simple test with NHWC, components 1", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5], + "dims": [1, 5, 3, 1], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5], + "dims": [5], + "type": "float32" + }, + { + "data": [4, 5, 6, 7, 8], + "dims": [5], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 2.325794219970703, 6, + 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539 + ], + "dims": [1, 5, 3, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Simple test with NHWC, components 2", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8], + "dims": [2, 6, 1, 1], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [6], + "type": "float32" + }, + { + "data": [4, 5, 6, 7, 8, 9], + "dims": [6], + "type": "float32" + } + ], + "outputs": [ + { + "data": [4, 5, 6, 7, 8, 9, 4, 5, 6, 7, 8, 9], + "dims": [2, 6, 1, 1], + "type": "float32" + } + ] + } + ] + }, { "name": "Simple test with NCHW", "operator": "InstanceNormalization", @@ -75,5 +148,81 @@ ] } ] + }, + { + "name": "Simple test with NCHW, components 1", + "operator": "InstanceNormalization", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5], + "dims": [1, 5, 3, 1], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5], + "dims": [5], + "type": "float32" + }, + { + "data": [4, 5, 6, 7, 8], + "dims": [5], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 2.325794219970703, 6, + 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539 + ], + "dims": [1, 5, 3, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Simple test with NCHW, components 2", + "operator": "InstanceNormalization", + "opset": { "domain": "", "version": 17 }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4, 3, 2], + "dims": [1, 3, 6, 1], + "type": "float32" + }, + { + "data": [1, 2, 3], + "dims": [3], + "type": "float32" + }, + { + "data": [4, 5, 6], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.5361523628234863, 3.1216912269592285, 3.70723032951355, 4.292769432067871, 4.878308296203613, + 5.4638471603393555, 1.8666191101074219, 3.9555397033691406, 6.044460296630859, 8.133380889892578, + 6.044460296630859, 3.9555397033691406, 10.3915433883667, 8.634925842285156, 6.878308296203613, + 5.121691703796387, 
3.365074634552002, 1.6084575653076172 + ], + "dims": [1, 3, 6, 1], + "type": "float32" + } + ] + } + ] } ] From fa14dcd2b66d8e1f45a78a81f71276770f10aadf Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Wed, 10 Jan 2024 07:07:54 +0800 Subject: [PATCH 20/29] [WebNN EP] Support subgraph of the control flow nodes (#18923) This PR also makes some processing on the subgraph's initializers. The subgraph doesn't contain all its required initializers, some common initializers are stored in its ancestor graphs. We need to collect all required initializers and re-map to the subgraph. --- .../core/providers/webnn/builders/helper.cc | 18 +++++++++++++ .../core/providers/webnn/builders/helper.h | 3 +++ .../providers/webnn/builders/model_builder.cc | 19 ++++++++++++++ .../providers/webnn/builders/model_builder.h | 2 +- .../webnn/webnn_execution_provider.cc | 26 +++++++++++++++---- 5 files changed, 62 insertions(+), 6 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/helper.cc b/onnxruntime/core/providers/webnn/builders/helper.cc index d34cb7e362446..7718fbdc2df88 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.cc +++ b/onnxruntime/core/providers/webnn/builders/helper.cc @@ -12,6 +12,24 @@ namespace onnxruntime { namespace webnn { +InitializedTensorSet CollectAllInitializedTensors(const GraphViewer& graph_viewer) { + InitializedTensorSet all_initializers; + if (graph_viewer.IsSubgraph()) { + const Graph* cur_graph = &graph_viewer.GetGraph(); + // Traverse up to the top-level graph, collecting all initializers. + while (cur_graph->IsSubgraph()) { + const auto& current_initializers = cur_graph->GetAllInitializedTensors(); + all_initializers.insert(current_initializers.begin(), current_initializers.end()); + cur_graph = cur_graph->ParentGraph(); + } + // Collect initializers in top-level graph. + const auto& current_initializers = cur_graph->GetAllInitializedTensors(); + all_initializers.insert(current_initializers.begin(), current_initializers.end()); + } + + return all_initializers; +} + bool GetShape(const NodeArg& node_arg, std::vector& shape, const logging::Logger& logger) { const auto* shape_proto = node_arg.Shape(); if (!shape_proto) { diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index 5aec81af15761..ea57ab1af19af 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -35,6 +35,9 @@ typedef struct { bool isCpuSupported; // The WebNN CPU backend XNNPack supports it (not about the CPU EP). } WebnnOpInfo; +// Collects all the initializer tensors in the subGraph and its ancestor graphs. 
+InitializedTensorSet CollectAllInitializedTensors(const GraphViewer& graph_viewer); + bool GetShape(const NodeArg& node_arg, std::vector& shape, const logging::Logger& logger); template diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.cc b/onnxruntime/core/providers/webnn/builders/model_builder.cc index b6631263dfb93..b57e1b89b0af0 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/model_builder.cc @@ -38,6 +38,25 @@ Status ModelBuilder::Initialize() { return Status::OK(); } +InitializedTensorSet ModelBuilder::GetInitializerTensors() { + if (graph_viewer_.IsSubgraph()) { + auto all_initializers = CollectAllInitializedTensors(graph_viewer_); + const auto sub_graph_id = graph_viewer_.GetFilterInfo(); + const auto subgraph_initializer_names = sub_graph_id->GetMetaDef()->constant_initializers; + InitializedTensorSet subgraph_initializers; + + for (const auto& name : subgraph_initializer_names) { + auto it = all_initializers.find(name); + if (it != all_initializers.end()) { + subgraph_initializers.insert(*it); + } + } + return subgraph_initializers; + } else { + return graph_viewer_.GetAllInitializedTensors(); + } +} + /* static */ const IOpBuilder* ModelBuilder::GetOpBuilder(const Node& node) { const auto& op_builders = GetOpBuilders(); const auto it = op_builders.find(node.OpType()); diff --git a/onnxruntime/core/providers/webnn/builders/model_builder.h b/onnxruntime/core/providers/webnn/builders/model_builder.h index c381eef3f42f7..16c8bf2d3c77f 100644 --- a/onnxruntime/core/providers/webnn/builders/model_builder.h +++ b/onnxruntime/core/providers/webnn/builders/model_builder.h @@ -30,7 +30,7 @@ class ModelBuilder { // Accessors for members. const GraphViewer& GetGraphViewer() const { return graph_viewer_; } - const InitializedTensorSet& GetInitializerTensors() const { return graph_viewer_.GetAllInitializedTensors(); } + InitializedTensorSet GetInitializerTensors(); const emscripten::val& GetBuilder() const { return wnn_builder_; } const emscripten::val& GetContext() const { return wnn_context_; } diff --git a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc index 4da54aaad3a33..cf18b3225eb47 100644 --- a/onnxruntime/core/providers/webnn/webnn_execution_provider.cc +++ b/onnxruntime/core/providers/webnn/webnn_execution_provider.cc @@ -59,10 +59,15 @@ WebNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_view const IKernelLookup& /*kernel_registries*/) const { std::vector> result; - // We do not run WebNN EP on subgraph, instead we cover this in the control flow nodes. - // TODO investigate whether we want to support subgraph using WebNN EP. - if (graph_viewer.IsSubgraph()) { - return result; + // For subgraph which is the attribute of the control flow nodes, part of its initializers are stored in its + // ancestor graphs as common initializers shared for other subgraphs. We need to collect all of them used for + // identifying the required initializer names and storing into 'meta_def->constant_initializers'. + // Thus we are able to get the required initialized tensors for this subgraph via the GetInitializerTensors() + // method defined in the model_builder.h file. 
+ InitializedTensorSet all_initializers; + const bool is_subgraph = graph_viewer.IsSubgraph(); + if (is_subgraph) { + all_initializers = webnn::CollectAllInitializedTensors(graph_viewer); } /* @@ -110,6 +115,7 @@ WebNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_view std::unique_ptr sub_graph = std::make_unique(); + std::vector subgraph_initializers; InlinedHashSet node_outputs; InlinedHashSet subgraph_inputs; InlinedHashSet subgraph_outputs; @@ -126,7 +132,11 @@ WebNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_view // skip the placeholder inputs. continue; } - // if the node input was not produced by this subgraph, add it to the subgraph inputs. + // If it is a subgraph of a control flow node, collect the constant initializer. + if (is_subgraph && Contains(all_initializers, input->Name())) { + subgraph_initializers.push_back(input->Name()); + } + // If the node input was not produced by this subgraph, add it to the subgraph inputs. if (node_outputs.count(input) == 0) { if (subgraph_inputs.count(input) == 0) { subgraph_inputs.insert(input); @@ -165,6 +175,12 @@ WebNNExecutionProvider::GetCapability(const onnxruntime::GraphViewer& graph_view meta_def->since_version = 1; meta_def->status = ONNX_NAMESPACE::EXPERIMENTAL; + if (is_subgraph) { + for (const auto& initializer : subgraph_initializers) { + meta_def->constant_initializers.push_back(initializer); + } + } + for (const auto& input : ordered_subgraph_inputs) { meta_def->inputs.push_back(input->Name()); } From b25980c011f658709404f1b850302f94fc9541f4 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 9 Jan 2024 17:09:31 -0800 Subject: [PATCH 21/29] Disable rust pipeline for now (#19067) ### Description They are not working. When we have time to continue working on it, we can restore them from git history. --- .../actions/rust-toolchain-setup/action.yml | 44 ------ .github/workflows/rust-ci.yml | 132 ------------------ 2 files changed, 176 deletions(-) delete mode 100644 .github/actions/rust-toolchain-setup/action.yml delete mode 100644 .github/workflows/rust-ci.yml diff --git a/.github/actions/rust-toolchain-setup/action.yml b/.github/actions/rust-toolchain-setup/action.yml deleted file mode 100644 index bf73fede16c7f..0000000000000 --- a/.github/actions/rust-toolchain-setup/action.yml +++ /dev/null @@ -1,44 +0,0 @@ -# yaml-language-server: $schema=https://json.schemastore.org/github-action.json - -name: 'Rust toolchain setup' -description: 'Common setup steps for GitHub workflows for Rust projects' - -runs: - using: composite - steps: - - uses: dtolnay/rust-toolchain@1.71.0 - with: - components: clippy, rustfmt - - uses: extractions/setup-just@v1 - with: - just-version: '1.15.0' # optional semver specification, otherwise latest - - ### - ### Linux setup - ### - - name: rustup - # We need to use the nightly rust tool change to enable registry-auth / to connect to ADO feeds. - if: ${{ (runner.os == 'Linux') }} - run: | - rustup set profile minimal - rustup install - shell: bash - # - name: Cargo login - # if: ${{ (runner.os == 'Linux') }} - # run: just cargo-login-ci - # shell: bash - - ### - ### Windows setup - ### - - name: rustup - # We need to use the nightly rust tool change to enable registry-auth / to connect to ADO feeds. 
- if: ${{ (runner.os == 'Windows') }} - run: | - rustup set profile minimal - rustup install - shell: pwsh - # - name: Cargo login - # if: ${{ (runner.os == 'Windows') }} - # run: just cargo-login-ci-windows - # shell: pwsh diff --git a/.github/workflows/rust-ci.yml b/.github/workflows/rust-ci.yml deleted file mode 100644 index 725c40c2ded53..0000000000000 --- a/.github/workflows/rust-ci.yml +++ /dev/null @@ -1,132 +0,0 @@ -name: Rust - -on: [pull_request] - -env: - CARGO_TERM_COLOR: always - RUST_LOG: onnxruntime=debug,onnxruntime-sys=debug - RUST_BACKTRACE: 1 - MANIFEST_PATH: ${{ github.workspace }}/rust/Cargo.toml - -jobs: - fmt: - name: Rustfmt - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - name: vendor onnxruntime source - run: just vendor - - name: fmt - run: cargo fmt --all -- --check - - download: - name: Download prebuilt ONNX Runtime archive from build.rs - runs-on: ubuntu-latest - env: - ORT_RUST_STRATEGY: download - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - run: rustup target install x86_64-unknown-linux-gnu - - run: rustup target install x86_64-apple-darwin - - run: rustup target install i686-pc-windows-msvc - - run: rustup target install x86_64-pc-windows-msvc - # ****************************************************************** - - name: Download prebuilt archive (CPU, x86_64-unknown-linux-gnu) - run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (CPU, x86_64-unknown-linux-gnu) - run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-1.*.tgz - # ****************************************************************** - - name: Download prebuilt archive (CPU, x86_64-apple-darwin) - run: cargo build --target x86_64-apple-darwin --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (CPU, x86_64-apple-darwin) - run: ls -lh target/x86_64-apple-darwin/debug/build/onnxruntime-sys-*/out/onnxruntime-osx-x64-1.*.tgz - # ****************************************************************** - - name: Download prebuilt archive (CPU, i686-pc-windows-msvc) - run: cargo build --target i686-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (CPU, i686-pc-windows-msvc) - run: ls -lh target/i686-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x86-1.*.zip - # ****************************************************************** - - name: Download prebuilt archive (CPU, x86_64-pc-windows-msvc) - run: cargo build --target x86_64-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (CPU, x86_64-pc-windows-msvc) - run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-x64-1.*.zip - # ****************************************************************** - - name: Download prebuilt archive (GPU, x86_64-unknown-linux-gnu) - env: - ORT_USE_CUDA: "yes" - run: cargo build --target x86_64-unknown-linux-gnu --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (GPU, x86_64-unknown-linux-gnu) - run: ls -lh target/x86_64-unknown-linux-gnu/debug/build/onnxruntime-sys-*/out/onnxruntime-linux-x64-gpu-1.*.tgz - # ****************************************************************** - - name: Download prebuilt archive (GPU, x86_64-pc-windows-msvc) - env: - ORT_USE_CUDA: 
"yes" - run: cargo build --target x86_64-pc-windows-msvc --manifest-path ${{ env.MANIFEST_PATH }} - - name: Verify prebuilt archive downloaded (GPU, x86_64-pc-windows-msvc) - run: ls -lh target/x86_64-pc-windows-msvc/debug/build/onnxruntime-sys-*/out/onnxruntime-win-gpu-x64-1.*.zip - - test: - name: Test Suite - runs-on: ${{ matrix.os }} - strategy: - fail-fast: false - matrix: - target: - [ - x86_64-unknown-linux-gnu, - x86_64-apple-darwin, - x86_64-pc-windows-msvc, - i686-pc-windows-msvc, - ] - include: - - target: x86_64-unknown-linux-gnu - os: ubuntu-latest - - target: x86_64-apple-darwin - os: macos-latest - - target: x86_64-pc-windows-msvc - os: windows-latest - - target: i686-pc-windows-msvc - os: windows-latest - env: - CARGO_BUILD_TARGET: ${{ matrix.target }} - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - name: vendor onnxruntime source - run: just vendor - - run: rustup target install ${{ matrix.target }} - - name: Install additional packages (macOS) - if: contains(matrix.target, 'x86_64-apple-darwin') - run: brew install libomp - - name: Build (cargo build) - run: cargo build --all --manifest-path ${{ env.MANIFEST_PATH }} - - name: Build tests (cargo test) - run: cargo test --no-run --manifest-path ${{ env.MANIFEST_PATH }} - - name: Build onnxruntime with 'model-fetching' feature - run: cargo build --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching - - name: Test onnxruntime-sys - run: cargo build --package onnxruntime-sys -- --test-threads=1 --nocapture - - name: Test onnxruntime - run: cargo test --manifest-path ${{ env.MANIFEST_PATH }} --features model-fetching -- --test-threads=1 --nocapture - - clippy: - name: Clippy - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - name: vendor onnxruntime source - run: just vendor - - run: clippy --all-features --manifest-path ${{ env.MANIFEST_PATH }} -- -D warnings - - package-sys: - name: Package onnxruntime-sys - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - uses: ./.github/actions/rust-toolchain-setup - - name: vendor onnxruntime source - run: just vendor - - run: cargo package --allow-dirty --package onnxruntime-sys From 730df1bfa284ffee7bc777de83b2c5e80c7cd98e Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Tue, 9 Jan 2024 18:35:21 -0800 Subject: [PATCH 22/29] Increase MacOS pipeline timeout (#19072) --- .../azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml | 2 +- .../github/azure-pipelines/templates/py-packaging-stage.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml index 9755e1f0771ba..693a06f9844f5 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml @@ -13,7 +13,7 @@ stages: jobs: - job: Linux_Training_CPU_Wheels - timeoutInMinutes: 120 + timeoutInMinutes: 180 workspace: clean: all pool: onnxruntime-Ubuntu2004-AMD-CPU diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml index 44904f9248b10..7cee5045bc4f3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-stage.yml 
@@ -102,7 +102,7 @@ stages: setVcvars: true BuildConfig: 'RelWithDebInfo' ExtraParam: ${{ parameters.build_py_parameters }} - timeoutInMinutes: 120 + timeoutInMinutes: 180 workspace: clean: all @@ -329,7 +329,7 @@ stages: - ${{ if eq(parameters.enable_mac_cpu, true) }}: - job: MacOS_py_Wheels - timeoutInMinutes: 120 + timeoutInMinutes: 180 workspace: clean: all pool: From ed0f26d3d4d72d1a449f759ee27bebf128633911 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Wed, 10 Jan 2024 12:58:50 +0800 Subject: [PATCH 23/29] [js/webgpu] Revert parse norm attributes (#19074) This resolves the below build errors: ``` lib/wasm/jsep/webgpu/op-resolve-rules.ts:19:23 - error TS2724: '"./ops/instance-norm"' has no exported member named 'parseInstanceNormAttributes'. Did you mean 'InstanceNormAttributes'? 19 import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm'; ~~~~~~~~~~~~~~~~~~~~~~~~~~~ lib/wasm/jsep/webgpu/op-resolve-rules.ts:19:23 - error TS6133: 'parseInstanceNormAttributes' is declared but its value is never read. 19 import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm'; ~~~~~~~~~~~~~~~~~~~~~~~~~~~ lib/wasm/jsep/webgpu/op-resolve-rules.ts:20:20 - error TS2305: Module '"./ops/layer-norm"' has no exported member 'parseLayerNormAttributes'. 20 import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm'; ~~~~~~~~~~~~~~~~~~~~~~~~ lib/wasm/jsep/webgpu/op-resolve-rules.ts:20:20 - error TS6133: 'parseLayerNormAttributes' is declared but its value is never read. 20 import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm'; ``` --- js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts index 286dfaf968211..90e02da986b8f 100644 --- a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts +++ b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts @@ -16,8 +16,8 @@ import {expand} from './ops/expand'; import {gather, parseGatherAttributes} from './ops/gather'; import {gatherElements, parseGatherElementsAttributes} from './ops/gather-elements'; import {gemm, parseGemmAttributes} from './ops/gemm'; -import {instanceNorm, parseInstanceNormAttributes} from './ops/instance-norm'; -import {layerNorm, parseLayerNormAttributes} from './ops/layer-norm'; +import {instanceNorm} from './ops/instance-norm'; +import {layerNorm} from './ops/layer-norm'; import {matMul} from './ops/matmul'; import {multiHeadAttention, parseMultiHeadAttentionAttributes} from './ops/multi-head-attentiion'; import {pad} from './ops/pad'; @@ -82,8 +82,8 @@ export const WEBGPU_OP_RESOLVE_RULES: Map = new ['GlobalMaxPool', [pool.globalMaxPool, pool.parseGlobalMaxPoolAttributes]], ['Greater', [binaryOps.greater]], ['GreaterOrEqual', [binaryOps.greaterOrEqual]], - ['InstanceNormalization', [instanceNorm, parseInstanceNormAttributes]], - ['LayerNormalization', [layerNorm, parseLayerNormAttributes]], + ['InstanceNormalization', [instanceNorm]], + ['LayerNormalization', [layerNorm]], ['LeakyRelu', [unaryOps.leakyRelu, unaryOps.parseAlphaAttributes]], ['Less', [binaryOps.less]], ['LessOrEqual', [binaryOps.lessOrEqual]], From 5f3113ecd612b4ef070d5a3c8a586d725cfdb13b Mon Sep 17 00:00:00 2001 From: PeixuanZuo <94887879+PeixuanZuo@users.noreply.github.com> Date: Wed, 10 Jan 2024 14:49:19 +0800 Subject: [PATCH 24/29] [ROCm] Fix hipify error: fast_divmod.h: No such file or directory (#19060) Fix error: ``` [ 48%] Built target onnxruntime_optimizer In 
file included from /onnxruntime_src/onnxruntime/core/providers/rocm/rocm_stream_handle.cc:5: /onnxruntime_src/onnxruntime/core/providers/rocm/rocm_common.h:11:10: fatal error: core/providers/rocm/shared_inc/fast_divmod.h: No such file or directory 11 | #include "core/providers/rocm/shared_inc/fast_divmod.h" | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ compilation terminated. ``` This error is due to onnxruntime_optimizer missing dependencies on hipify generated files. --- cmake/onnxruntime_optimizer.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/onnxruntime_optimizer.cmake b/cmake/onnxruntime_optimizer.cmake index 6f09583199ffd..f15d5b8dd6f80 100644 --- a/cmake/onnxruntime_optimizer.cmake +++ b/cmake/onnxruntime_optimizer.cmake @@ -130,3 +130,7 @@ if (NOT onnxruntime_BUILD_SHARED_LIB) RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() + +if (onnxruntime_USE_ROCM) + add_dependencies(onnxruntime_optimizer generate_hipified_files) +endif() From cf78d01546ca059a2ab487e01626e38029a3e8fd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xavier=20Dupr=C3=A9?= Date: Wed, 10 Jan 2024 16:36:50 +0100 Subject: [PATCH 25/29] remove use of ai.onnx.ml in test for custom ops and local functions (#19043) ### Description QNN_Nuget_Windows does not allow ai.onnx.ml operators but the test test_custom_op_local_function is using LabelEncoder. The operator can be removed as the test is only checking custom ops api. ### Motivation and Context Fix test test_custom_op_local_function in QNN_Nuget_Windows pipeline. --- .../custom_ops_type_inference_fails_0.onnx | Bin 2086 -> 1977 bytes 1 file changed, 0 insertions(+), 0 deletions(-) diff --git a/onnxruntime/test/testdata/custom_op_local_function/custom_ops_type_inference_fails_0.onnx b/onnxruntime/test/testdata/custom_op_local_function/custom_ops_type_inference_fails_0.onnx index 8116ec338064567cea06fafe45168567813071ed..3a43a7378a9123b1f51a30cc2270d2edaa1c0f76 100644 GIT binary patch delta 512 zcmZ1`u#=yigVSmo3zzdmLH&u|pBcR-+cQe&8*;IhCFYc-YN>KC3$QyuI9?J=3=n`$ zPcCPiE-cK&7$L;R#lyiU#KFZR!~n#TWtmc0PB9*1oji@nbnC(6Nyu#e+E6il{aR$x?~9L+4r#mU7~k}n~+fRSl(Bl8apbs?7Gl9V(h z4(I&5;*!L?5-m|KZm`qi({l0?OGNu*PdF`LWSpGATFRu#HCdWb$xwjNON>h>CqFqc zN3XCHs7Eg)KQ}QmPk=E9i|8rF*OQsqbn1?WfqiEv#FT4DSPRUb ipa21BF~Za$k*b9nHvgcSHt%5zV`TP%_)U*AzX1T^6p=>& delta 679 zcmdnVzf3@ogF}cxib07%gTaWwY8?xg-$ZtO9xYBTrjmRK!3B&=R+AN3g(rI5x3m`G zPA)Eq&&@5)NGwQI;&9H-D=taQE74NoVl7L|DNWT9<6stGw_;{uVsb)OI=PT>IzK-b zV}uYF7Y7HU5DOOr*JNp?R8c2BF0RB(z5Kkq3ccJM0j5)ouh_UIDu_*<%_PTS1a!dU zolFYeN-PXO;KW*$T9lu*fRV+@Mu;OXwH)0QYA{zwaWDd1!Sx>o7@gqCC(AP{Fse;< zV3y?N0eKMUy;Y1Ldy<)d)T;?kwTn6Zb@oh zaek2!dr&IaRaV77oiI@$uHurEG^h$5pTwlp9M`<${FKxpEeS5}?9|F)kojgNA;k`Y z4nhvX4k8N}*|pTTc)@lT$EW4wCzcf3_Q{@bvfU>T>13<=SH#Ws|63uq1&nM~Na1It z$Ax5-5DUoK$*-7YC%3Sa2%v>@5GYbvPcbfH;hH>=QD$;BD_^~~5DO@Xkpe&x7Nor3 yAZ2850;>SUfq@WHt|5|A1(;G1jL-)vgy}HCql51sBEpz*jW!Fgg)uUM-3b8JD$Ni8 From df116b82c743f9104bd090a310d5777a4475e539 Mon Sep 17 00:00:00 2001 From: RandySheriffH <48490400+RandySheriffH@users.noreply.github.com> Date: Wed, 10 Jan 2024 14:13:25 -0800 Subject: [PATCH 26/29] Custom op API for thread pool (#18980) Allow custom op to invoke internal thread-pool for parallelism. 
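As an illustration of the new API, a lite custom-op kernel could parallelize an element-wise computation roughly as follows. This is a minimal sketch, not code from this PR: `Doubler`, `DoubleOne`, and `KernelDouble` are made-up names, and the `Ort::Custom::Tensor` helpers are the same lite custom-op utilities used in the test changes below.

```
#include <cstddef>

#include "onnxruntime_cxx_api.h"
#include "onnxruntime_lite_custom_op.h"

// Per-iteration state handed back to the callback through usr_data.
struct Doubler {
  const float* from = {};
  float* to = {};
};

// Callback signature required by KernelContext_ParallelFor:
// the user data pointer first, then the iteration index.
void DoubleOne(void* raw_data, size_t ith) {
  auto data = reinterpret_cast<Doubler*>(raw_data);
  data->to[ith] = 2.f * data->from[ith];
}

// A lite custom-op kernel that doubles its input in parallel.
void KernelDouble(OrtKernelContext* context,
                  const Ort::Custom::Tensor<float>& X,
                  Ort::Custom::Tensor<float>& Y) {
  Doubler doubler{X.Data(), Y.Allocate(X.Shape())};
  Ort::KernelContext ctx(context);
  // num_batch == 0 places no limit on batching, so ORT runs a simple
  // parallel-for over all elements on its intra-op thread pool.
  ctx.ParallelFor(DoubleOne, static_cast<size_t>(X.NumberOfElement()), 0, &doubler);
}
```

Passing a non-zero `num_batch` instead routes the call through `ThreadPool::TryBatchParallelFor`, capping the number of batches the iteration space is split into.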
--------- Co-authored-by: Randy Shuai --- .../core/session/onnxruntime_c_api.h | 13 +++++++ .../core/session/onnxruntime_cxx_api.h | 1 + .../core/session/onnxruntime_cxx_inline.h | 4 ++ onnxruntime/core/session/custom_ops.cc | 26 +++++++++++++ onnxruntime/core/session/onnxruntime_c_api.cc | 1 + onnxruntime/core/session/ort_apis.h | 2 + .../testdata/custom_op_library/cpu/cpu_ops.cc | 37 +++++++++++++++++-- 7 files changed, 80 insertions(+), 4 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index 06fef6bf72cc9..8cd0d0051d1eb 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -4528,6 +4528,19 @@ struct OrtApi { * \since Version 1.17. */ ORT_API2_STATUS(SetDeterministicCompute, _Inout_ OrtSessionOptions* options, bool value); + + /** + * Run fn in parallel + * + * \param[in] context + * \param[in] fn Function accepting usr_data and an integer as iterator + * \param[in] total The number of times fn is to be invoked + * \param[in] num_batch Number of batches by which the "total" is to be divided in maximum. When zero, there is no limit + * \param[in] usr_data User data to be passed back to fn + * + * \since Version 1.17. + */ + ORT_API2_STATUS(KernelContext_ParallelFor, _In_ const OrtKernelContext* context, _In_ void (*fn)(void*, size_t), _In_ size_t total, _In_ size_t num_batch, _In_ void* usr_data); }; /* diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index 16d9451624533..3773a01cb65a8 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -2057,6 +2057,7 @@ struct KernelContext { Logger GetLogger() const; OrtAllocator* GetAllocator(const OrtMemoryInfo& memory_info) const; OrtKernelContext* GetOrtKernelContext() const { return ctx_; } + void ParallelFor(void (*fn)(void*, size_t), size_t total, size_t num_batch, void* usr_data) const; private: OrtKernelContext* ctx_; diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h index 63e55603736b6..db4619eeeae62 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_inline.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_inline.h @@ -1658,6 +1658,10 @@ inline Logger KernelContext::GetLogger() const { return Logger{out}; } +inline void KernelContext::ParallelFor(void (*fn)(void*, size_t), size_t total, size_t num_batch, void* usr_data) const { + ThrowOnError(GetApi().KernelContext_ParallelFor(ctx_, fn, total, num_batch, usr_data)); +} + inline OpAttr::OpAttr(const char* name, const void* data, int len, OrtOpAttrType type) { Ort::ThrowOnError(GetApi().CreateOpAttr(name, data, len, type, &p_)); } diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index eea675eb0193a..984fdd6bce325 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -24,6 +24,7 @@ #include "core/session/custom_ops.h" #include "core/session/inference_session.h" #include "core/session/ort_apis.h" +#include "core/platform/threadpool.h" #if !defined(ORT_MINIMAL_BUILD) static constexpr uint32_t min_ort_version_with_optional_io_support = 8; @@ -380,6 +381,31 @@ ORT_API_STATUS_IMPL(OrtApis::KernelContext_GetResource, _In_ const OrtKernelCont API_IMPL_END }; 
+ORT_API_STATUS_IMPL(OrtApis::KernelContext_ParallelFor, _In_ const OrtKernelContext* context, _In_ void (*fn)(void*, size_t), _In_ size_t total, _In_ size_t num_batch, _In_ void* usr_data) {
+  API_IMPL_BEGIN
+  if (!context) {
+    return OrtApis::CreateStatus(ORT_RUNTIME_EXCEPTION, "Invalid context");
+  }
+  if (fn && total) {
+    const auto* ctx = reinterpret_cast<const onnxruntime::OpKernelContext*>(context);
+    auto* tp = ctx->GetOperatorThreadPool();
+    if (num_batch) {
+      onnxruntime::concurrency::ThreadPool::TryBatchParallelFor(
+          tp,
+          static_cast<std::ptrdiff_t>(total),
+          [fn, usr_data](std::ptrdiff_t ith) { fn(usr_data, static_cast<size_t>(ith)); },
+          static_cast<std::ptrdiff_t>(num_batch));
+    } else {
+      onnxruntime::concurrency::ThreadPool::TrySimpleParallelFor(
+          tp,
+          static_cast<std::ptrdiff_t>(total),
+          [fn, usr_data](std::ptrdiff_t ith) { fn(usr_data, static_cast<size_t>(ith)); });
+    }
+  }
+  return nullptr;
+  API_IMPL_END
+};
+
 #ifdef _WIN32
 #pragma warning(pop)
 #endif
diff --git a/onnxruntime/core/session/onnxruntime_c_api.cc b/onnxruntime/core/session/onnxruntime_c_api.cc
index 76a8a778025e1..08bfb618f55b4 100644
--- a/onnxruntime/core/session/onnxruntime_c_api.cc
+++ b/onnxruntime/core/session/onnxruntime_c_api.cc
@@ -2722,6 +2722,7 @@ static constexpr OrtApi ort_api_1_to_17 = {
     &OrtApis::SetSymbolicDimensions,
     &OrtApis::ReadOpAttr,
     &OrtApis::SetDeterministicCompute,
+    &OrtApis::KernelContext_ParallelFor,
 };
 
 // OrtApiBase can never change as there is no way to know what version of OrtApiBase is returned by OrtGetApiBase.
diff --git a/onnxruntime/core/session/ort_apis.h b/onnxruntime/core/session/ort_apis.h
index c9e4074a1afe2..6df5e4145b416 100644
--- a/onnxruntime/core/session/ort_apis.h
+++ b/onnxruntime/core/session/ort_apis.h
@@ -502,4 +502,6 @@ ORT_API_STATUS_IMPL(SetSymbolicDimensions, _In_ OrtTensorTypeAndShapeInfo* info,
 ORT_API_STATUS_IMPL(ReadOpAttr, _In_ const OrtOpAttr* op_attr, _In_ OrtOpAttrType type, _Inout_ void* data, _In_ size_t len, _Out_ size_t* out);
 ORT_API_STATUS_IMPL(SetDeterministicCompute, _Inout_ OrtSessionOptions* options, bool value);
 
+ORT_API_STATUS_IMPL(KernelContext_ParallelFor, _In_ const OrtKernelContext* context, _In_ void (*fn)(void*, size_t), _In_ size_t total, _In_ size_t num_batch, _In_ void* user_data);
+
 } // namespace OrtApis
diff --git a/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc b/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc
index 85edfa0e59f1d..ebef441350d4c 100644
--- a/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc
+++ b/onnxruntime/test/testdata/custom_op_library/cpu/cpu_ops.cc
@@ -49,16 +49,45 @@ struct KernelOne {
   }
 };
 
+struct DataI {
+  const float* from = {};
+  float* to = {};
+};
+
+struct DataII {
+  const float* from = {};
+  int32_t* to = {};
+};
+
+// floats to floats
+void CopyI(void* raw_data, size_t ith) {
+  auto data = reinterpret_cast<DataI*>(raw_data);
+  data->to[ith] = data->from[ith];
+}
+
+// floats to int32_t
+void CopyII(void* raw_data, size_t ith) {
+  auto data = reinterpret_cast<DataII*>(raw_data);
+  data->to[ith] = static_cast<int32_t>(round(data->from[ith]));
+}
+
 // lite custom op as a function
-void KernelTwo(const Ort::Custom::Tensor<float>& X,
+void KernelTwo(OrtKernelContext* context,
+               const Ort::Custom::Tensor<float>& X,
                Ort::Custom::Tensor<int32_t>& Y) {
   const auto& shape = X.Shape();
   auto X_raw = X.Data();
   auto Y_raw = Y.Allocate(shape);
+  std::vector<float> floats(static_cast<size_t>(X.NumberOfElement()), 0.f);
+
+  DataI data_i = {X_raw, floats.data()};
   auto total = std::accumulate(shape.begin(), shape.end(), 1LL, std::multiplies<int64_t>());
-  for (int64_t i = 0; i < total; i++) {
-    Y_raw[i] = static_cast<int32_t>(round(X_raw[i]));
-  }
+
+  Ort::KernelContext ctx(context);
+  ctx.ParallelFor(CopyI, static_cast<size_t>(total), 0, &data_i);  // test simple parallel for
+
+  DataII data_ii = {floats.data(), Y_raw};
+  ctx.ParallelFor(CopyII, static_cast<size_t>(total), 2, &data_ii);  // test batch parallel for
 }
 
 template <typename T>

From 731b50dfc4f8074185dc70f3a10236fa4fdfc0aa Mon Sep 17 00:00:00 2001
From: yuwenzho
Date: Wed, 10 Jan 2024 15:13:04 -0800
Subject: [PATCH 27/29] Support INT4 weight only quantize, including RTN and GPTQ 2 algorithms (#17390)

### Description
Support INT4 weight-only quantization (WOQ) via Intel Neural Compressor, including two algorithms: RTN and GPTQ.

**Note:** weight-only quantization requires `neural-compressor` >= 2.3.2 (the version asserted in the code below).

### Motivation and Context
As large language models (LLMs) become more prevalent, there is a growing need for new and improved quantization methods that can meet the computational demands of these modern architectures while maintaining accuracy. Compared with normal quantization such as W8A8, weight-only quantization is often a better trade-off between performance and accuracy.

RTN is the most straightforward way to quantize weights. The GPTQ algorithm provides more accurate quantization but requires more computational resources.

### Evaluation results
The following table shows the accuracy results of Llama-2 models evaluated on the [lambada_openai](https://huggingface.co/datasets/lambada) task. `GPTQ W4G32Asym` in the configuration column means the GPTQ algorithm is used for 4-bit weight-only quantization with group_size=32 and scheme=asym.

| Model name | Configuration | Lambada_openai Accuracy | Lambada_openai Perplexity | Accuracy Ratio [WOQ/FP32] |
| --- | --- | --- | --- | --- |
| meta-llama/Llama-2-7b-chat-hf | FP32 | 0.7058 | 3.2788 | / |
| meta-llama/Llama-2-7b-chat-hf | GPTQ W4G32Asym | 0.7025 | 3.4489 | 99.53% |
| meta-llama/Llama-2-7b-hf | FP32 | 0.7392 | 3.3950 | / |
| meta-llama/Llama-2-7b-hf | GPTQ W4G32Asym | 0.7326 | 3.5286 | 99.11% |
| meta-llama/Llama-2-13b-chat-hf | FP32 | 0.7312 | 2.9163 | / |
| meta-llama/Llama-2-13b-chat-hf | GPTQ W4G128Asym | 0.7289 | 3.0061 | 99.56% |
| meta-llama/Llama-2-13b-hf | FP32 | 0.7677 | 3.0438 | / |
| meta-llama/Llama-2-13b-hf | GPTQ W4G32Asym | 0.7607 | 3.1562 | 99.09% |
| meta-llama/Llama-2-70b-chat-hf | FP32 | 0.7543 | 2.6181 | / |
| meta-llama/Llama-2-70b-chat-hf | RTN W4G32Sym | 0.7489 | 2.6850 | 99.28% |
| meta-llama/Llama-2-70b-hf | FP32 | 0.7964 | 2.6612 | / |
| meta-llama/Llama-2-70b-hf | RTN W4G32Sym | 0.7896 | 2.7546 | 99.15% |
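
As a quick usage illustration (a minimal sketch, not part of the upstream description): the file names below are placeholders, while the classes and arguments come from `matmul_4bits_quantizer.py` in this patch.

```
from onnxruntime.quantization import matmul_4bits_quantizer

# RTN needs no calibration data; for GPTQ, pass a CalibrationDataReader instead:
#   algo_config = matmul_4bits_quantizer.GPTQWeightOnlyQuantConfig(calibration_data_reader=my_reader)
algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig()

quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(
    "model_fp32.onnx",        # a model path or an onnx.ModelProto
    block_size=32,            # group size for the 4-bit blocks
    is_symmetric=True,        # sym/asym quantization scheme
    algo_config=algo_config,  # None falls back to the default 4-bit path
)
quant.process()
quant.model.save_model_to_file("model_int4.onnx")
```

Both paths produce `MatMulNBits` nodes; the chosen algorithm only changes how the scales and zero points are derived.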
--------- Signed-off-by: yuwenzho Co-authored-by: Wang, Mengni --- .../quantization/matmul_4bits_quantizer.py | 189 ++++++++++++++++-- .../python/tools/quantization/quantize.py | 11 +- .../quantization/test_op_matmul_4bits.py | 70 ++++++- 3 files changed, 246 insertions(+), 24 deletions(-) diff --git a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py index 6293bcbbf95bd..3e9f9a6544a71 100644 --- a/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py +++ b/onnxruntime/python/tools/quantization/matmul_4bits_quantizer.py @@ -7,6 +7,8 @@ from __future__ import annotations import argparse +import copy +import importlib import logging import os @@ -14,9 +16,11 @@ import numpy.typing as npt import onnx from onnx.onnx_pb import GraphProto, ModelProto, NodeProto, TensorProto +from packaging import version from onnxruntime.capi._pybind_state import quantize_matmul_4bits +from .calibrate import CalibrationDataReader from .onnx_model import ONNXModel from .quant_utils import attribute_to_kwarg @@ -24,24 +28,98 @@ logger = logging.getLogger(__name__) +class WeightOnlyQuantConfig: + def __init__(self, algorithm): + """This is the Base class for Weight Only Quant Configuration. + + Args: + algorithm: + weight only quantize algorithm name. + """ + self.algorithm = algorithm + + +class RTNWeightOnlyQuantConfig(WeightOnlyQuantConfig): + def __init__( + self, + ratios=None, + ): + """ + This is a class for round-to-nearest (RTN) algorithm Weight Only Quant Configuration. + RTN is the most straightforward way to quantize weight using scale maps. + + Args: + ratios: + percentile of clip. Defaults to {}. + """ + if ratios is None: + ratios = {} + super().__init__( + algorithm="RTN", + ) + self.ratios = ratios + + +class GPTQWeightOnlyQuantConfig(WeightOnlyQuantConfig): + def __init__( + self, + calibration_data_reader: CalibrationDataReader, + percdamp=0.01, + blocksize=128, + actorder=False, + mse=False, + perchannel=True, + ): + """ + This is a class for GPTQ algorithm Weight Only Quant Configuration. + GPTQ algorithm provides more accurate quantization but requires more computational resources. + + Args: + calibration_data_reader: + a calibration data reader. It enumerates calibration data and generates inputs for the original model. + percdamp: + percent of the average Hessian diagonal to use for dampening. + blocksize (int, optional): + channel number in one block to execute a GPTQ quantization iteration. + actorder (bool, optional): + whether rearrange Hessian matrix considering the diag's value. + mse (bool, optional): + whether get scale and zero point with mse error. + perchannel (bool, optional): + whether quantize weight per-channel. 
+ """ + super().__init__( + algorithm="GPTQ", + ) + self.calibration_data_reader = calibration_data_reader + self.percdamp = percdamp + self.blocksize = blocksize + self.actorder = actorder + self.mse = mse + self.perchannel = perchannel + + class MatMul4BitsQuantizer: """Perform 4b quantization of constant MatMul weights""" def __init__( self, - model: ModelProto, + model: ModelProto | str, block_size: int, is_symmetric: bool, accuracy_level: int | None = None, - nodes_to_exclude: list[str] | None = None, + nodes_to_exclude=None, + algo_config: WeightOnlyQuantConfig = None, ): if nodes_to_exclude is None: nodes_to_exclude = [] - self.model = ONNXModel(model) + self.model = ONNXModel(onnx.load(model)) if isinstance(model, str) else ONNXModel(model) + self.model_path = model if isinstance(model, str) else None self.block_size = block_size self.is_symmetric = is_symmetric self.accuracy_level = accuracy_level self.nodes_to_exclude = set(nodes_to_exclude) + self.algo_config = algo_config @staticmethod def __get_initializer(name, graph_path: list[GraphProto]) -> tuple[TensorProto, GraphProto]: @@ -176,20 +254,99 @@ def _process_subgraph(self, graph_stack: list[GraphProto]): graph_stack.pop() return graph + def _generate_q4_node_config(self): + """Generate weight only quant configuration for nodes.""" + q4_node_config = {} + template_config_q4 = { + "bits": 4, + "group_size": self.block_size, + "scheme": "sym" if self.is_symmetric else "asym", + } + for node in self.model.model.graph.node: + if node.op_type in ["MatMul"]: + if not all([self.model.get_initializer(i) is None for i in node.input]): + q4_node_config[node.name] = template_config_q4 + return q4_node_config + + def int4_quant_algo(self): + """4b quantize a model with RTN or GPTQ algorithm. Please refer to + https://github.com/intel/neural-compressor/blob/master/docs/source/quantization_weight_only.md + for more details on weight only quantization using IntelĀ® Neural Compressor. 
+ """ + + def inc_dataloader(): + data_reader = copy.deepcopy(self.algo_config.calibration_data_reader) + for data in data_reader: + yield data, None + + kwargs = {} + if self.accuracy_level is not None: + kwargs["accuracy_level"] = self.accuracy_level + weight_only_node_config = self._generate_q4_node_config() + + algorithm = self.algo_config.algorithm + logger.info(f"start to quantize model with {algorithm} algorithm...") + if algorithm == "RTN": + from neural_compressor.adaptor.ox_utils.weight_only import rtn_quantize + + kwargs["ratios"] = self.algo_config.ratios + + self.model = rtn_quantize( + model=self.model_path if self.model_path is not None else self.model.model, + weight_config=weight_only_node_config, + **kwargs, + ) + elif algorithm == "GPTQ": + from neural_compressor.adaptor.ox_utils.weight_only import gptq_quantize + + kwargs["percdamp"] = self.algo_config.percdamp + kwargs["blocksize"] = self.algo_config.blocksize + kwargs["actorder"] = self.algo_config.actorder + kwargs["mse"] = self.algo_config.mse + kwargs["perchannel"] = self.algo_config.perchannel + kwargs["n_samples"] = -1 + dataloader = inc_dataloader() + + self.model = gptq_quantize( + model=self.model_path if self.model_path is not None else self.model.model, + weight_config=weight_only_node_config, + dataloader=dataloader, + **kwargs, + ) + logger.info(f"complete quantization of model with {algorithm} algorithm.") + def process(self): - # use a stack to keep track of sub-graphs - graph_stack = [self.model.graph()] - opset_import = self.model.opset_import() - - has_ms_domain = False - for opset in opset_import: - if opset.domain == "com.microsoft": - has_ms_domain = True - if not has_ms_domain: - opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)]) - - self._process_subgraph(graph_stack) - self.model.clean_initializers() + if self.algo_config is None: + # use a stack to keep track of sub-graphs + graph_stack = [self.model.graph()] + opset_import = self.model.opset_import() + + has_ms_domain = False + for opset in opset_import: + if opset.domain == "com.microsoft": + has_ms_domain = True + if not has_ms_domain: + opset_import.extend([onnx.helper.make_opsetid("com.microsoft", 1)]) + + self._process_subgraph(graph_stack) + self.model.clean_initializers() + else: + # use IntelĀ® Neural Compressor for RTN or GPTQ weight-only quantize algorithm + try: + importlib.import_module("neural_compressor") + except Exception as e: + logging.error(f"{e}.") + raise RuntimeError( + "neural-compressor is not correctly installed. Please check your environment." + ) from e + + import neural_compressor + + assert version.parse(neural_compressor.__version__) >= version.parse( + "2.3.2" + ), "Require neural-compressor >= 2.3.2 to support weight only quantization!" 
+
+            self.int4_quant_algo()
 
 
 def parse_args():
diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py
index aed46563c2764..1bd2ef42151d0 100644
--- a/onnxruntime/python/tools/quantization/quantize.py
+++ b/onnxruntime/python/tools/quantization/quantize.py
@@ -466,7 +466,6 @@ def quantize_static(
 
         import copy
 
-        import onnx
         from neural_compressor.adaptor.ox_utils.smooth_quant import ORTSmoothQuant
 
         def inc_dataloader():
@@ -478,13 +477,11 @@ def inc_dataloader():
         dataloader = inc_dataloader()
         sq = ORTSmoothQuant(model_input, dataloader, reduce_range)
         del dataloader
-        model = sq.transform(
-            extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True)
-        ).model
-        nodes_to_exclude.extend([i.name for i in model.graph.node if i.name not in orig_nodes])
+        model = sq.transform(extra_options.get("SmoothQuantAlpha", 0.5), extra_options.get("SmoothQuantFolding", True))
         sq_path = tempfile.TemporaryDirectory(prefix="ort.quant.")
-        model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix()
-        onnx.save_model(model, model_input, save_as_external_data=True)
+        model_input = Path(sq_path.name).joinpath("sq_model.onnx").as_posix()
+        model.save(model_input)
+        nodes_to_exclude.extend([i.name for i in model.model.graph.node if i.name not in orig_nodes])
         model = load_model_with_shape_infer(Path(model_input))  # use smooth quant model for calibration
         with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir:
diff --git a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py
index 02f51cc4fa809..73dae08af8ece 100644
--- a/onnxruntime/test/python/quantization/test_op_matmul_4bits.py
+++ b/onnxruntime/test/python/quantization/test_op_matmul_4bits.py
@@ -71,13 +71,16 @@ def construct_model_matmul(self, output_model_path: str, symmetric: bool) -> Non
         output_name = "output"
         initializers = []
 
-        def make_matmul(input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name: str, output_name: str):
+        def make_matmul(
+            input_name, weight_shape: Union[int, Tuple[int, ...]], weight_name: str, output_name: str, node_name: str
+        ):
             weight_data = self.fill_int4_data(weight_shape, symmetric).astype(np.float32)
             initializers.append(onnx.numpy_helper.from_array(weight_data, name=weight_name))
             return onnx.helper.make_node(
                 "MatMul",
                 [input_name, weight_name],
                 [output_name],
+                node_name,
             )
 
         in_features = 52
@@ -88,6 +91,7 @@ def make_matmul(input_name, weight_shape: Union[int, Tuple[int, ...]], weight_na
             [in_features, out_features],
             "linear1.weight",
             output_name,
+            "MatMul_0",
         )
 
         # make graph
@@ -139,6 +143,48 @@ def quant_test(
         else:
             raise exception
 
+    def quant_test_with_algo(
+        self,
+        algorithm: str,
+        model_fp32_path: str,
+        data_reader: TestDataFeeds,
+        block_size: int,
+        is_symmetric: bool,
+    ):
+        model_int4_path = str(
+            Path(self._tmp_model_dir.name).joinpath(f"MatMulNBits_{block_size}_{is_symmetric}.onnx").absolute()
+        )
+
+        # Quantize fp32 model to int4 model
+        from onnxruntime.quantization import matmul_4bits_quantizer
+
+        algo_config = None
+        if algorithm == "RTN":
+            # test RTN algorithm
+            algo_config = matmul_4bits_quantizer.RTNWeightOnlyQuantConfig()
+        elif algorithm == "GPTQ":
+            # test GPTQ algorithm
+            algo_config = matmul_4bits_quantizer.GPTQWeightOnlyQuantConfig(calibration_data_reader=data_reader)
+
+        model = quant_utils.load_model_with_shape_infer(Path(model_fp32_path))
+        quant = matmul_4bits_quantizer.MatMul4BitsQuantizer(model, block_size,
is_symmetric, algo_config=algo_config) + quant.process() + quant.model.save_model_to_file(model_int4_path, False) + + quant_nodes = {"MatMulNBits": 1} + check_op_type_count(self, model_int4_path, **quant_nodes) + + data_reader.rewind() + + try: + check_model_correctness(self, model_fp32_path, model_int4_path, data_reader.get_next()) + except Exception as exception: + if "4b quantization not yet supported on this hardware platform!" in exception.args[0]: + # Currently we don't have int4 quantization support on all platforms, has to tolerate this exception + pass + else: + raise exception + @unittest.skipIf( find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" ) @@ -159,6 +205,28 @@ def test_quantize_matmul_int4_offsets(self): data_reader = self.input_feeds(1, {"input": [100, 52]}) self.quant_test(model_fp32_path, data_reader, 32, False) + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_using_rtn_algo(self): + if not find_spec("neural_compressor"): + self.skipTest("skip test_smooth_quant since neural_compressor is not installed") + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, symmetric=False) + data_reader = self.input_feeds(1, {"input": [100, 52]}) + self.quant_test_with_algo("RTN", model_fp32_path, data_reader, 32, False) + + @unittest.skipIf( + find_spec("onnxruntime.training"), "Skip because training package doesn't has quantize_matmul_4bits" + ) + def test_quantize_matmul_int4_using_gptq_algo(self): + if not find_spec("neural_compressor"): + self.skipTest("skip test_smooth_quant since neural_compressor is not installed") + model_fp32_path = str(Path(self._tmp_model_dir.name).joinpath("matmul_fp32_offset.onnx").absolute()) + self.construct_model_matmul(model_fp32_path, symmetric=False) + data_reader = self.input_feeds(1, {"input": [100, 52]}) + self.quant_test_with_algo("GPTQ", model_fp32_path, data_reader, 32, False) + if __name__ == "__main__": unittest.main() From e58319ebfc344419b94ab5f8f27f7ce5eabe56f5 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Wed, 10 Jan 2024 15:29:34 -0800 Subject: [PATCH 28/29] [TensorRT EP] Fix memleak (#19053) ### Description To fix memleak: ```bash 192 bytes in 1 blocks are definitely lost in loss record 1,254 of 1,999 at 0x483BE63: operator new(unsigned long) (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so) by 0x4A93FD5: OrtApis::CreateTensorRTProviderOptions(OrtTensorRTProviderOptionsV2**) (in /code/onnxruntime/build/Linux/Release/libonnxruntime.so.1.17.0) by 0x1502E1: onnxruntime::perftest::OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env&, std::random_device&, onnxruntime::perftest::PerformanceTestConfig const&, TestModelInfo const&) (in /code/onnxruntime/build/Linux/Release/onnxruntime_perf_test) by 0x15A404: onnxruntime::perftest::PerformanceRunner::PerformanceRunner(Ort::Env&, onnxruntime::perftest::PerformanceTestConfig const&, std::random_device&) (in /code/onnxruntime/build/Linux/Release/onnxruntime_perf_test) by 0x14C6D9: real_main(int, char**) (in /code/onnxruntime/build/Linux/Release/onnxruntime_perf_test) by 0x145A2A: main (in /code/onnxruntime/build/Linux/Release/onnxruntime_perf_test) ``` add ptr to help release trtep provider options ### Motivation and Context --- onnxruntime/test/perftest/ort_test_session.cc | 
2 ++
 1 file changed, 2 insertions(+)

diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc
index ac25c98b15758..13082fe69cf48 100644
--- a/onnxruntime/test/perftest/ort_test_session.cc
+++ b/onnxruntime/test/perftest/ort_test_session.cc
@@ -170,6 +170,8 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device
     const auto& api = Ort::GetApi();
     OrtTensorRTProviderOptionsV2* tensorrt_options;
     Ort::ThrowOnError(api.CreateTensorRTProviderOptions(&tensorrt_options));
+    std::unique_ptr<OrtTensorRTProviderOptionsV2, decltype(api.ReleaseTensorRTProviderOptions)> rel_trt_options(
+        tensorrt_options, api.ReleaseTensorRTProviderOptions);
     std::vector<const char*> option_keys, option_values;
     // used to keep all option keys and value strings alive
     std::list<std::string> buffer;

From fd6bab4250c41a7f6498e6fa02ba446bc74e0a8d Mon Sep 17 00:00:00 2001
From: Jiajia Qin
Date: Thu, 11 Jan 2024 08:12:43 +0800
Subject: [PATCH 29/29] [js/webgpu] Provide a vectorized algorithm for GroupedConv (#18884)

### Description

This PR provides a vectorized algorithm for NHWC GroupedConv to improve performance. The aggregate time of GroupedConv in mobilenetv2-12 drops from ~4 ms to ~1 ms on an Intel Alder Lake machine, roughly a 20% improvement for the whole model.
---
 .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts  |  99 +++++++++++-
 js/web/lib/wasm/jsep/webgpu/ops/conv.ts       |  26 ++-
 js/web/test/data/ops/conv.jsonc               | 152 +++++++++++++++++-
 3 files changed, 271 insertions(+), 6 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
index 14482272bad38..21b4953d3f90c 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts
@@ -3,9 +3,9 @@
 
 import {TensorView} from '../../tensor-view';
 import {ShapeUtil} from '../../util';
-import {ProgramInfo} from '../types';
+import {ProgramInfo, ProgramUniform} from '../types';
 
-import {inputVariable, outputVariable, ShaderHelper} from './common';
+import {createTensorShapeVariables, getMaxComponents, inputVariable, outputVariable, ShaderHelper} from './common';
 import {calculateOutputShape, ConvAttributes} from './conv';
 import {getActivationSnippet} from './fuse-utils';
 
@@ -95,3 +95,98 @@ export const createGroupedConvProgramInfo =
     getShaderSource,
   };
 };
+
+export const createGroupedConvVectorizeProgramInfo =
+    (inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[]): ProgramInfo => {
+      const hasBias = inputs.length > 2;
+      const components = getMaxComponents(outputShape[3]);
+      const outputNumber = getMaxComponents(outputShape[2]);
+      const outputSize = ShapeUtil.size(outputShape) / components / outputNumber;
+      const xShape = [inputs[0].dims[0], inputs[0].dims[1], inputs[0].dims[2], inputs[0].dims[3] / components];
+      const wShape = [inputs[1].dims[0], inputs[1].dims[1], inputs[1].dims[2], inputs[1].dims[3] / components];
+      const outputShapeInShader = [outputShape[0], outputShape[1], outputShape[2], outputShape[3] / components];
+
+      const programUniforms: ProgramUniform[] = [
+        {type: 'uint32', data: outputSize}, {type: 'int32', data: attributes.strides},
+        {type: 'int32', data: attributes.pads}, ...createTensorShapeVariables(xShape),
+        ...createTensorShapeVariables(wShape), ...createTensorShapeVariables(outputShapeInShader)
+      ];
+      const xNumber = (outputNumber - 1) * attributes.strides[1] + wShape[1];
+      const getShaderSource = (shaderHelper: ShaderHelper) => {
+        const output = outputVariable('output', inputs[0].dataType, outputShapeInShader.length, components);
+        const
{activationFunction, applyActivation} = getActivationSnippet(attributes, output.type.value); + const x = inputVariable('x', inputs[0].dataType, xShape.length, components); + const w = inputVariable('w', inputs[1].dataType, wShape.length, components); + const inputVars = [x, w]; + if (hasBias) { + inputVars.push(inputVariable('b', inputs[2].dataType, inputs[2].dims, components)); + } + const processBias = hasBias ? 'value += b[output_channel];' : ''; + + return ` + ${ + shaderHelper.registerUniform('output_size', 'u32') + .registerUniform('strides', 'i32', 2) + .registerUniform('pads', 'i32', 2) + .declareVariables(...inputVars, output)} + ${activationFunction} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + let width0 = uniforms.output_shape[3]; + let output_channel = global_idx % width0; + var index1 = global_idx / width0; + let width1 = uniforms.output_shape[2] / ${outputNumber}u; + let col = (index1 % width1) * ${outputNumber}u; + index1 = index1 / width1; + let row = index1 % uniforms.output_shape[1]; + let batch = index1 / uniforms.output_shape[1]; + + let x_corner = vec2(i32(row), i32(col)) * uniforms.strides - uniforms.pads; + + var x_vals: array<${x.type.value}, ${xNumber}>; + var values: array<${output.type.value}, ${outputNumber}>; + let input_channel = output_channel; + // Use constant instead of uniform can give better performance for w's height/width. + for (var w_height: u32 = 0u; w_height < ${wShape[0]}; w_height++) { + let x_height = x_corner.x + i32(w_height); + if (x_height >= 0 || u32(x_height) < uniforms.x_shape[1]) { + for (var i = 0; i < ${xNumber}; i++) { + let x_width = x_corner.y + i; + if (x_width >= 0 && u32(x_width) < uniforms.x_shape[2]) { + x_vals[i] = ${x.get('batch', 'u32(x_height)', 'u32(x_width)', 'input_channel')}; + } else { + x_vals[i] = ${x.type.value}(0); + } + } + for (var w_width: u32 = 0u; w_width < ${wShape[1]}; w_width++) { + let w_val = ${w.get('w_height', 'w_width', '0', 'output_channel')}; + for (var i = 0u; i < ${outputNumber}u; i++) { + values[i] = fma(x_vals[i * ${attributes.strides[1]}u + w_width], w_val, values[i]); + } + } + } + } + + for (var i = 0u; i < ${outputNumber}u; i++) { + var value = values[i]; + ${processBias} + ${applyActivation} + ${output.set('batch', 'row', 'col + i', 'output_channel', 'value')}; + } + }`; + }; + + return { + name: 'GroupedConv-Vectorize', + shaderCache: { + hint: `${attributes.activationCacheKey};${components};${outputNumber};${xNumber};${wShape[0]};${wShape[1]}`, + inputDependencies: hasBias ? 
['rank', 'rank', 'type'] : ['rank', 'rank'] + }, + getRunData: () => ({ + outputs: [{dims: outputShape, dataType: inputs[0].dataType}], + dispatchGroup: {x: Math.ceil(outputSize / 64 /* workgroup size */)}, + programUniforms + }), + getShaderSource, + }; + }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 33a5db7ff6b25..cb40a9f08d2d7 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -8,7 +8,7 @@ import {ComputeContext} from '../types'; import {createConv2DMatMulProgramInfo} from './3rd-party/conv2d_mm_webgpu'; import {createMatmulProgramInfo} from './3rd-party/matmul_packed_webgpu'; -import {createGroupedConvProgramInfo} from './conv-grouped'; +import {createGroupedConvProgramInfo, createGroupedConvVectorizeProgramInfo} from './conv-grouped'; import {InternalActivationAttributes, parseInternalActivationAttributes} from './fuse-utils'; import {createNaiveMatmulProgramInfo} from './matmul'; import {createTransposeProgramInfo} from './transpose'; @@ -136,12 +136,32 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut // check attributes // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ + const isChannelsLast = attributes.format === 'NHWC'; if (attributes.group !== 1) { - context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes)); + if (isChannelsLast && inputs[1].dims[0] === attributes.group && inputs[1].dims[1] === 1 && + attributes.dilations[0] === 1 && attributes.dilations[1] === 1) { + const outputShape = calculateOutputShape( + inputs[0].dims, inputs[1].dims, attributes.dilations, adjustedAttributes.pads, attributes.strides, + isChannelsLast); + const transposedWeight = (context.kernelCustomData.wT as TensorView | undefined) ?? + context.compute( + createTransposeProgramInfo(inputs[1], weightTransposeAttribute), + {inputs: [1], outputs: [attributes.wIsConst ? -2 : -1]})[0]; + if (attributes.wIsConst && !context.kernelCustomData.wT) { + context.kernelCustomData.wT = transposedWeight; + } + const convInputs = [inputs[0], transposedWeight]; + if (inputs.length === 3) { + convInputs.push(inputs[2]); + } + context.compute( + createGroupedConvVectorizeProgramInfo(convInputs, adjustedAttributes, outputShape), {inputs: convInputs}); + } else { + context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes)); + } return; } - const isChannelsLast = attributes.format === 'NHWC'; const hasBias = inputs.length === 3; const inputHeight = inputs[0].dims[isChannelsLast ? 1 : 2]; const inputWidth = inputs[0].dims[isChannelsLast ? 
2 : 3]; diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index 2e8eaaba191d0..cc10df5864233 100644 --- a/js/web/test/data/ops/conv.jsonc +++ b/js/web/test/data/ops/conv.jsonc @@ -298,7 +298,157 @@ } ] }, - + { + "name": "conv - vectorize group - A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [1, 1], "type": "ints" }, + { "name": "group", "data": 2, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0], + "dims": [1, 2, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0], + "dims": [2, 1, 1, 1], + "type": "float32" + } + ], + "outputs": [ + { + "data": [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 18.0, 20.0, 22.0, 24.0, 26.0, 28.0, 30.0, 32.0, 34.0], + "dims": [1, 2, 3, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv - vectorize group - B", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 0, 0, 0 + ], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + }, + { + "data": [0.1, 0.2, 0.3], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [27.1, 37.1, 57.1, 67.1, 293.2, 319.2, 371.2, 397.2, 847.3, 889.3, 409.3, 428.3], + "dims": [1, 3, 2, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv - vectorize group - C", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" } + ], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + ], + "dims": [1, 3, 3, 4], + "type": "float32" + }, + { + "data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [34, 44, 54, 74, 84, 94, 386, 412, 438, 490, 516, 542, 1122, 1164, 1206, 1290, 1332, 1374], + "dims": [1, 3, 2, 3], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv - vectorize group - D", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [ + { "name": "kernel_shape", "data": [2, 2], "type": "ints" }, + { "name": "group", "data": 3, "type": "int" }, + { "name": "strides", "data": [2, 2], "type": "ints" } + ], + "cases": [ + { + "name": "T[0] strides = [2, 2]", + "inputs": [ + { + "data": [ + 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, + 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0 + ], + "dims": [1, 3, 3, 4], + "type": "float32" + }, + { + 
"data": [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0], + "dims": [3, 1, 2, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [34, 54, 386, 438, 1122, 1206], + "dims": [1, 3, 1, 2], + "type": "float32" + } + ] + } + ] + }, { "name": "conv - pointwise", "operator": "Conv",